diff --git a/CHANGELOG.md b/CHANGELOG.md
index 367d6935..b92893e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,22 @@
# CUTLASS 2.x
+## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08)
+ * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
+ * Fast Tensor Core operations:
+ * Maximum performance via [`mma.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends)
+ * Tensor Float 32, BFloat16, and double-precision data types
+ * Mixed integer data types (int8, int4, bin1)
+ * Asynchronous copy for deep software pipelines via [`cp.async`](https://docs.nvidia.com/cuda/parallel-thread-execution)
+ * Features:
+ * SDK examples showing GEMM fused with bias+relu and fused GEMM+GEMM
+ * Complex-valued GEMMs targeting NVIDIA Ampere Tensor Cores in double-precision and Tensor Float 32
+ * Gaussian complex GEMMs using 3m complex multiply algorithm
+ * Universal GEMM kernel supporting two batch modes and two algorithms for parallel reductions
+ * Policy updates:
+ * [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit) needed to enable NVIDIA Ampere Architecture features
+ * Disabled F16C by default for compatibility - enable on cmake command line with `-DCUTLASS_ENABLE_F16C=ON`
+
## [2.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.1.0) (2020-04-06)
* BLAS-style host-side API added to [CUTLASS Library](/media/docs/quickstart.md#cutlass-library)
* API to launch compiled kernel instances for GEMM and planar complex GEMM
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b7bbc48..b6583747 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@@ -32,7 +32,7 @@ endif()
message(STATUS "CMake Version: ${CMAKE_VERSION}")
-project(CUTLASS VERSION 2.1.0 LANGUAGES CXX)
+project(CUTLASS VERSION 2.2.0 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
find_package(Doxygen QUIET)
@@ -84,7 +84,7 @@ endif()
set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
if (NOT CUDA_VERSION VERSION_LESS 7.5)
- list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 50)
+ list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 53)
endif()
if (NOT CUDA_VERSION VERSION_LESS 8.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 60 61)
@@ -98,6 +98,9 @@ endif()
if (NOT CUDA_VERSION VERSION_LESS 10.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 75)
endif()
+if (NOT CUDA_VERSION VERSION_LESS 11.0)
+ list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80)
+endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.")
@@ -154,7 +157,7 @@ endif()
set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.")
set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")
-set(CUTLASS_ENABLE_F16C ON CACHE BOOL "Enable F16C x86 extensions in host code.")
+set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.")
#
# CUTLASS generator cmake configuration
@@ -248,8 +251,8 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
endif()
list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
- list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm=-pragma-unroll-threshold=100000)
- list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm=-unroll-threshold=5000)
+ list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -pragma-unroll-threshold=100000)
+ list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument)
string(REPLACE "." ";" CUDA_VERSION_PARTS ${CMAKE_CUDA_COMPILER_VERSION})
@@ -271,7 +274,7 @@ function(cutlass_apply_cuda_gencode_flags TARGET)
set(NVCC_FLAGS)
set(CLANG_FLAGS)
foreach(ARCH ${CUTLASS_NVCC_ARCHS_ENABLED})
- list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH})
+ list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH})
set(CODES)
if(CUTLASS_NVCC_EMBED_CUBIN)
list(APPEND CODES sm_${ARCH})
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index fc95674d..f8778b80 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -9,15 +9,17 @@ This is the official list of CUTLASS developers and contributors.
## DEVELOPERS
Andrew Kerr
Haicheng Wu
-Naila Farooqui
+Manish Gupta
Dustyn Blasig
Pradeep Ramani
-Manish Gupta
-Aditya Atluri
+Naila Farooqui
+Piotr Majcher
Paul Springer
-David Tanner
-Scott Yokim
Jin Wang
+Scott Yokim
+Markus Hohnerbach
+Aditya Atluri
+David Tanner
## CONTRIBUTORS
Timothy Costa
@@ -25,12 +27,10 @@ Julien Demouth
Brian Fahs
Michael Goldfarb
Mostafa Hagog
-Markus Hohnerbach
Fei Hu
Alan Kaatz
Tina Li
Timmy Liu
-Piotr Majcher
Duane Merrill
Kevin Siu
Markus Tavenrath
diff --git a/CUDA.cmake b/CUDA.cmake
index d1eb4dbc..b8b343a7 100644
--- a/CUDA.cmake
+++ b/CUDA.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@@ -206,14 +206,14 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
function(cutlass_correct_source_file_language_property)
if(CUDA_COMPILER MATCHES "clang")
foreach(File ${ARGN})
- if(${File} MATCHES ".*\.cu$")
+ if(File MATCHES ".*\.cu$")
set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
endif()
endforeach()
endif()
endfunction()
-set(CUTLASS_UNITY_BUILD_ENABLED ON CACHE BOOL "Enable combined source compilation")
+set(CUTLASS_UNITY_BUILD_ENABLED OFF CACHE BOOL "Enable combined source compilation")
set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files")
function(cutlass_unify_source_files TARGET_ARGS_VAR)
diff --git a/LICENSE.txt b/LICENSE.txt
index 283345b5..64a49d68 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2017 - 2019, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2017 - 2020, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
diff --git a/README.md b/README.md
index dd1c4c65..c1507c03 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@

-# CUTLASS 2.1
+# CUTLASS 2.2
-_CUTLASS 2.1 - April 2020_
+_CUTLASS 2.2 - June 2020_
CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@@ -17,14 +17,28 @@ and applications.
To support a wide variety of applications, CUTLASS provides extensive support for
mixed-precision computations, providing specialized data-movement and
multiply-accumulate abstractions for half-precision floating
-point (FP16), single-precision floating point (FP32), double-precision floating
+point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32),
+single-precision floating point (FP32), double-precision floating
point (FP64) types, integer data types (4b and 8b), and binary data types (1b).
-Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations for
+
+Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations
targeting the programmable, high-throughput _Tensor Cores_ implemented by
-NVIDIA's Volta and Turing architectures.
+NVIDIA's Volta, Turing, and Ampere architectures.
See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
+See the [functionality listing](media/docs/functionality.md) for the list of operations
+supported at each level of the execution model hierarchy.
+
+# What's New in CUTLASS 2.2
+
+CUTLASS 2.2 is a significant update to CUTLASS adding:
+
+- Coverage of [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
+- Tensor Core-accelerated GEMMs targeting Tensor Float 32, BFloat16, and double-precision data types
+- Deep software pipelines using asynchronous copy
+- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit)
+
# What's New in CUTLASS 2.1
CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding:
@@ -32,7 +46,6 @@ CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding:
- [Planar complex GEMM kernels](/examples/10_planar_complex/planar_complex.cu) targeting Volta and Turing Tensor Cores
- BLAS-style API to launch kernels compiled into the [CUTLASS Library](/media/docs/quickstart.md#cutlass-library)
-
# What's New in CUTLASS 2.0
CUTLASS 2.0 is a substantial refactoring from the previous version, intended to offer:
@@ -43,9 +56,6 @@ CUTLASS 2.0 is a substantial refactoring from the previous version, intended to
**See the [CHANGELOG](CHANGELOG.md) for more details.**
-See the [functionality listing](media/docs/functionality.md) for the list of operations
-supported at each level of the execution model hierarchy.
-
# Performance

@@ -53,15 +63,15 @@ supported at each level of the execution model hierarchy.
CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels,
they exhibit performance comparable to cuBLAS for scalar GEMM
computations. The above figure shows CUTLASS performance relative to cuBLAS
-for large matrix dimensions on an NVIDIA GeForce 2080 Ti and an NVIDIA TitanV
-using CUDA 10.2. Tensor Core operations are implemented using CUDA's
+for large matrix dimensions on an NVIDIA GeForce 2080 Ti, an NVIDIA A100, and an NVIDIA TitanV
+using CUDA 11.0 Toolkit. Tensor Core operations are implemented using CUDA's
[mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma).
# Compatibility
CUTLASS requires a C++11 host compiler and
-performs best when built with the [CUDA 10.2 Toolkit](https://developer.nvidia.com/cuda-toolkit).
-It is compatible with CUDA 9.2, CUDA 10.0, and CUDA 10.1.
+performs best when built with the [CUDA 11.0 Toolkit](https://developer.nvidia.com/cuda-toolkit).
+It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, and CUDA 10.2.
We have tested the following environments.
@@ -70,27 +80,28 @@ We have tested the following environments.
| Windows 10 | Microsoft Visual Studio 2015|
| | Microsoft Visual Studio 2017|
| Ubuntu 16.04 | GCC 5.4.0 |
-| Ubuntu 18.04 | GCC 7.3.0 |
+| Ubuntu 18.04 | GCC 7.5.0 |
Additionally, CUTLASS may be built with clang.
See [these instructions](media/docs/quickstart.md#clang) for more details.
CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
-any Maxwell-, Pascal-, Volta-, or Turing- architecture NVIDIA GPU.
+any Maxwell-, Pascal-, Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU.
-|**GPU**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**|
-|---|---|---|
-|NVIDIA GeForce 1080|9.2| |
-|NVIDIA TitanXP|9.2| |
-|NVIDIA Tesla P100|9.2| |
-|NVIDIA Tesla V100|9.2|10.1|
-|NVIDIA TitanV|9.2|10.1|
-|NVIDIA GeForce RTX 2080 TI, 2080, 2070|10.0|10.2|
-|NVIDIA Tesla T4|10.0|10.2|
+|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**|
+|---|---|---|---|
+|NVIDIA Tesla P100|6.0|9.2| |
+|NVIDIA GeForce 1080|6.1|9.2| |
+|NVIDIA TitanXP|6.1|9.2| |
+|NVIDIA Tesla V100|7.0|9.2|10.1|
+|NVIDIA TitanV|7.0|9.2|10.1|
+|NVIDIA GeForce RTX 2080 TI, 2080, 2070|7.5|10.0|10.2|
+|NVIDIA Tesla T4|7.5|10.0|10.2|
+|NVIDIA A100|8.0|11.0|11.0|
# Documentation
-CUTLASS 2.1 is described in the following documents and the accompanying
+CUTLASS 2.2 is described in the following documents and the accompanying
[Doxygen documentation](https://nvidia.github.io/cutlass).
- [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS
@@ -124,7 +135,7 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
```
Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
-for CUDA architecture versions 5.0, 6.0, 6.1, 7.0 and 7.5. To reduce compile time you can specify
+for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, and 8.0. To reduce compile time you can specify
the architectures to build CUTLASS for by changing the CMake configuration setting
`CUTLASS_NVCC_ARCHS`.
@@ -210,6 +221,10 @@ examples/
10_planar_complex/ # example demonstrating planar complex GEMM kernels
11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes
+
+ 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu
+
+   13_fused_two_gemms/     # example demonstrating two GEMMs fused in one kernel
```
### Tools
@@ -255,29 +270,32 @@ $ make cutlass_profiler -j
Example command line for profiling SGEMM kernels is as follows:
```
-$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096
+$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
=============================
Problem ID: 1
- Provider: CUTLASS
- Operation: cutlass_simt_sgemm_128x128_nn
+ Provider: CUTLASS
+ OperationKind: gemm
+ Operation: cutlass_simt_sgemm_128x128_8x2_nn_align1
- Disposition: Passed
- Status: Success
+ Status: Success
+ Verification: ON
+ Disposition: Passed
- Arguments: --m=4352 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 \
- --split_k_slices=1 --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 \
- --stages=2 --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 \
- --max_cc=1024
+ cuBLAS: Passed
- Bytes: 52428800 bytes
- FLOPs: 146064539648 flops
+ Arguments: --m=3456 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \
+ --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \
+ --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024
- Runtime: 10.5424 ms
- Memory: 4.63158 GiB/s
+ Bytes: 180355072 bytes
+ FLOPs: 115992428544 flops
- Math: 13854.9 GFLOP/s
+ Runtime: 6.73655 ms
+ Memory: 24.934 GiB/s
+
+ Math: 17218.4 GFLOP/s
```
[Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)
diff --git a/cmake/nop.cu b/cmake/nop.cu
index 571c6c7c..518a582b 100644
--- a/cmake/nop.cu
+++ b/cmake/nop.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/cuBLAS.cmake b/cuBLAS.cmake
index d7f330cf..4c73a1db 100644
--- a/cuBLAS.cmake
+++ b/cuBLAS.cmake
@@ -10,28 +10,35 @@ if((DEFINED CUTLASS_ENABLE_CUBLAS AND NOT CUTLASS_ENABLE_CUBLAS) OR
message(STATUS "cuBLAS Disabled.")
elseif(NOT TARGET cublas)
-
+
find_path(
- _CUBLAS_INCLUDE_DIR cublas.h
- PATHS
- ${CUDA_TOOLKIT_ROOT_DIR}/include
- $ENV{CUBLAS_PATH}/include
- $ENV{CUDA_PATH}/include
- ${CUBLAS_PATH}/include
- /usr/include)
+ _CUBLAS_INCLUDE_DIR
+ NAMES cublas.h
+ HINTS
+ ${CUBLAS_INCLUDE_PATH}
+ ENV CUBLAS_INCLUDE_PATH
+ ${CUBLAS_PATH}
+ ENV CUBLAS_PATH
+ ${CUDA_TOOLKIT_ROOT_DIR}
+ PATH_SUFFIXES
+ include
+ )
find_library(
- _CUBLAS_LIBRARY cublas
+ _CUBLAS_LIBRARY
+ NAMES cublas
HINTS
- ${CUDA_TOOLKIT_ROOT_DIR}/lib64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
- $ENV{CUBLAS_PATH}/lib64
- $ENV{CUBLAS_PATH}/lib/x64
- $ENV{CUDA_PATH}/lib64
- $ENV{CUDA_PATH}/lib/x64
- ${CUBLAS_PATH}/lib64
- ${CUBLAS_PATH}/lib/x64
- /usr/lib/x86_64-linux-gnu)
+ ${CUBLAS_LIBRARY_PATH}
+ ENV CUBLAS_LIBRARY_PATH
+ ${_CUBLAS_INCLUDE_DIR}/..
+ ${CUBLAS_PATH}
+ ENV CUBLAS_PATH
+ ${CUDA_TOOLKIT_ROOT_DIR}
+ PATH_SUFFIXES
+ lib64
+ lib/x64
+ lib
+ )
if(_CUBLAS_INCLUDE_DIR AND _CUBLAS_LIBRARY)
@@ -79,17 +86,20 @@ if(CUTLASS_ENABLE_CUBLAS AND NOT TARGET cublas)
$)
find_library(
- _CUBLASLT_LIBRARY cublasLt
+ _CUBLASLT_LIBRARY
+ NAMES cublasLt
HINTS
- ${CUDA_TOOLKIT_ROOT_DIR}/lib64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
- $ENV{CUBLAS_PATH}/lib64
- $ENV{CUBLAS_PATH}/lib/x64
- $ENV{CUDA_PATH}/lib64
- $ENV{CUDA_PATH}/lib/x64
- ${CUBLAS_PATH}/lib64
- ${CUBLAS_PATH}/lib/x64
- /usr/lib/x86_64-linux-gnu)
+ ${CUBLAS_LIBRARY_PATH}
+ ENV CUBLAS_LIBRARY_PATH
+ ${_CUBLAS_INCLUDE_DIR}/..
+ ${CUBLAS_PATH}
+ ENV CUBLAS_PATH
+ ${CUDA_TOOLKIT_ROOT_DIR}
+ PATH_SUFFIXES
+ lib64
+ lib/x64
+ lib
+ )
if(_CUBLASLT_LIBRARY AND NOT TARGET cublasLt)
@@ -106,6 +116,8 @@ if(CUTLASS_ENABLE_CUBLAS AND NOT TARGET cublas)
add_library(nvidia::cublasLt ALIAS cublasLt)
+ target_link_libraries(cublas INTERFACE cublasLt)
+
endif()
endif()
diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt
index 5b833b85..9ae257d9 100644
--- a/examples/00_basic_gemm/CMakeLists.txt
+++ b/examples/00_basic_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu
index 41564632..bda012ab 100644
--- a/examples/00_basic_gemm/basic_gemm.cu
+++ b/examples/00_basic_gemm/basic_gemm.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt
index 2dfa083c..5f22b7b1 100644
--- a/examples/01_cutlass_utilities/CMakeLists.txt
+++ b/examples/01_cutlass_utilities/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu
index 0b6aa386..d1eaa57f 100644
--- a/examples/01_cutlass_utilities/cutlass_utilities.cu
+++ b/examples/01_cutlass_utilities/cutlass_utilities.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt
index 4e9af4fb..5e6112e0 100644
--- a/examples/02_dump_reg_shmem/CMakeLists.txt
+++ b/examples/02_dump_reg_shmem/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu
index 39d58db8..ed712aa8 100644
--- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu
+++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt
index 81211df9..5a08c0f8 100644
--- a/examples/03_visualize_layout/CMakeLists.txt
+++ b/examples/03_visualize_layout/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h
index c72b1228..dd7de198 100644
--- a/examples/03_visualize_layout/options.h
+++ b/examples/03_visualize_layout/options.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu
index 655d1f37..0d2b25eb 100644
--- a/examples/03_visualize_layout/register_layout.cu
+++ b/examples/03_visualize_layout/register_layout.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -34,6 +34,8 @@
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
#include "visualize_layout.h"
#include "register_layout.h"
@@ -59,18 +61,40 @@ void RegisterLayouts(std::map
// Integer matrix multiply.int4 8832 TN kblock128
{"TensorOpMultiplicand<4,128>",
new VisualizeLayout>},
+ // Integer matrix multiply.int4 16864 TN kblock256
+ {"TensorOpMultiplicand<4,256>",
+ new VisualizeLayout>},
// Integer matrix multiply 8816 Interleaved-32
{"TensorOpMultiplicand<8,32>",
new VisualizeLayout>},
// Integer matrix multiply 8816 TN kblock64
{"TensorOpMultiplicand<8,64>",
new VisualizeLayout>},
+ {"TensorOpMultiplicand<8,128>",
+ new VisualizeLayout>},
// Matrix Multiply 1688 TN kblock32
{"TensorOpMultiplicand<16,32>",
new VisualizeLayout>},
// Matrix multiply 1688 NT
{"TensorOpMultiplicand<16,64>",
new VisualizeLayout>},
+ // Matrix multiply 1688.TF32 TN kblock16
+ {"TensorOpMultiplicand<32,16>",
+ new VisualizeLayout>},
+ // Matrix multiply 1688.TF32 TN kblock32
+ {"TensorOpMultiplicand<32,32>",
+ new VisualizeLayout>},
+ // Matrix multiply 1688 NT
+ {"TensorOpMultiplicandCongruous<32,32>",
+ new VisualizeLayout<
+ cutlass::layout::TensorOpMultiplicandCongruous<32, 32>>},
+ // Matrix multiply 884 NT
+ {"TensorOpMultiplicandCongruous<64,16>",
+ new VisualizeLayout<
+ cutlass::layout::TensorOpMultiplicandCongruous<64, 16>>},
+ // Matrix multiply 884 TN
+ {"TensorOpMultiplicand64bCrosswise",
+ new VisualizeLayout},
{"TensorOpMultiplicandCongruous<128,4>",
new VisualizeLayout<
cutlass::layout::TensorOpMultiplicandCongruous<128, 4>>},
@@ -82,7 +106,7 @@ void RegisterLayouts(std::map
cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>>},
{"VoltaTensorOpMultiplicandCrosswise<16,32>",
new VisualizeLayout<
- cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>},
+ cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>}
};
for (auto layout : layout_pairs) {
diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h
index fee911f7..1518e433 100644
--- a/examples/03_visualize_layout/register_layout.h
+++ b/examples/03_visualize_layout/register_layout.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp
index 8908d2c1..a0f27181 100644
--- a/examples/03_visualize_layout/visualize_layout.cpp
+++ b/examples/03_visualize_layout/visualize_layout.cpp
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -65,14 +65,26 @@ void print_usage(std::ostream &out) {
"--extent=64,64 --vectorize=32 --output-shape=256,4\n"
<< "$ 03_visualize_layout \"TensorOpMultiplicand<4,128>\" "
"--extent=128,32 --vectorize=32 --output-shape=256,4\n"
+ << "$ 03_visualize_layout \"TensorOpMultiplicand<4,256>\" "
+ "--extent=256,16 --vectorize=32 --output-shape=256,4\n"
<< "$ 03_visualize_layout \"TensorOpMultiplicand<8,32>\" "
"--extent=32,64 --vectorize=16 --output-shape=128,4\n"
<< "$ 03_visualize_layout \"TensorOpMultiplicand<8,64>\" "
"--extent=64,32 --vectorize=16 --output-shape=128,4\n"
+ << "$ 03_visualize_layout \"TensorOpMultiplicand<8,128>\" "
+ "--extent=128,16 --vectorize=16 --output-shape=128,4\n"
<< "$ 03_visualize_layout \"TensorOpMultiplicand<16,32>\" "
"--extent=32,32 --vectorize=8 --output-shape=64,4\n"
<< "$ 03_visualize_layout \"TensorOpMultiplicand<16,64>\" "
"--extent=64,16 --vectorize=8 --output-shape=64,4\n"
+ << "$ 03_visualize_layout \"TensorOpMultiplicand<32,16>\" "
+ "--extent=16,32 --vectorize=4 --output-shape=32,4\n"
+ << "$ 03_visualize_layout \"TensorOpMultiplicand<32,32>\" "
+ "--extent=32,16 --vectorize=4 --output-shape=32,4\n"
+ << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<32,32>\" "
+ "--extent=32,16 --vectorize=4 --output-shape=32,4\n"
+ << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<64, 16>\" "
+ "--extent=16,16 --vectorize=2 --output-shape=16,4\n"
<< "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCrosswise<16,32>\" "
"--extent=32,64 --vectorize=4 --output-shape=64,4\n"
<< "$ 03_visualize_layout \"VotlaTensorOpMultiplicandCongruous<16>\" "
diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h
index 031916c7..4093d277 100644
--- a/examples/03_visualize_layout/visualize_layout.h
+++ b/examples/03_visualize_layout/visualize_layout.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt
index cef15624..cd32e228 100644
--- a/examples/04_tile_iterator/CMakeLists.txt
+++ b/examples/04_tile_iterator/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu
index e6315760..5c56f33b 100644
--- a/examples/04_tile_iterator/tile_iterator.cu
+++ b/examples/04_tile_iterator/tile_iterator.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt
index 6c9bf504..6cd0ca8d 100644
--- a/examples/05_batched_gemm/CMakeLists.txt
+++ b/examples/05_batched_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu
index d1fecda6..a9d8a9c6 100644
--- a/examples/05_batched_gemm/batched_gemm.cu
+++ b/examples/05_batched_gemm/batched_gemm.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt
index 750c6205..7b30ae16 100644
--- a/examples/06_splitK_gemm/CMakeLists.txt
+++ b/examples/06_splitK_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu
index 5fb513cb..f0e1d578 100644
--- a/examples/06_splitK_gemm/splitk_gemm.cu
+++ b/examples/06_splitK_gemm/splitk_gemm.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -39,7 +39,7 @@ inner product (1/16th of output), they accumulate to single output matrix.
Writing a single high performance matrix multiplication kernel is hard but do-able. Whereas writing
high performance kernels at scale which works for multiple problem sizes with good abstractions is
-really hard. CUTLASS solves this problem by providing simplified abstractions (knobs) to compose
+really hard. CUTLASS solves this problem by providing simplified abstractions to compose
multiple sections of gemm kernel. When used properly, the kernels can hit peak performance of GPU
easily.
@@ -144,7 +144,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M =
using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4
// This code section describes how threadblocks are scheduled on GPU
-using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ??
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ??
// This code section describes ?
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
@@ -172,17 +172,7 @@ using Gemm = cutlass::gemm::device::GemmSplitKParallel;
-int main() {
-
- //
- // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1.
- //
- // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
- //
- if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
- std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
- return -1;
- }
+int run() {
cudaDeviceProp props;
@@ -316,11 +306,30 @@ int main() {
tensor_ref_d.sync_host();
// Check if output from CUTLASS kernel and reference kernel are equal or not
- std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(),
- tensor_ref_d.host_view())
- ? "Passed"
- : "Failed")
- << std::endl;
+ bool passed = cutlass::reference::host::TensorEquals(
+ tensor_d.host_view(),
+ tensor_ref_d.host_view());
- CUTLASS_CHECK(status);
+ std::cout << (passed ? "Passed" : "Failed") << std::endl;
+
+ return (passed ? 0 : -1);
}
+
+int main() {
+
+ //
+ // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1.
+ //
+ // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
+ //
+ if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
+ std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+
+ // Returning zero, so this test passes when built with older CUDA Toolkits. Its actions are no-ops.
+ return 0;
+ }
+ else {
+ return run();
+ }
+}
+
diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt
index 56dfce9e..82e81722 100644
--- a/examples/07_volta_tensorop_gemm/CMakeLists.txt
+++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
index 447cc1cc..208c4f64 100644
--- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
+++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -156,7 +156,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M =
using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4
// This code section describes how threadblocks are scheduled on GPU
-using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ??
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ??
// This code section describes ?
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
@@ -188,15 +188,7 @@ using Gemm = cutlass::gemm::device::Gemm;
-int main() {
-
- // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1.
- //
- // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
- if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
- std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
- return -1;
- }
+int run() {
cudaDeviceProp props;
@@ -223,7 +215,7 @@ int main() {
cutlass::HostTensor tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
cutlass::HostTensor tensor_b(
- problem_size.nk()); // <- Create matrix B with dimensions N x K
+ problem_size.kn()); // <- Create matrix B with dimensions K x N
cutlass::HostTensor tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
cutlass::HostTensor tensor_d(
@@ -326,12 +318,28 @@ int main() {
tensor_ref_d.sync_host();
// Check if output from CUTLASS kernel and reference kernel are equal or not
- std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(),
- tensor_ref_d.host_view())
- ? "Passed"
- : "Failed")
- << std::endl;
+ bool passed = cutlass::reference::host::TensorEquals(
+ tensor_d.host_view(),
+ tensor_ref_d.host_view());
- CUTLASS_CHECK(status);
- return 0;
+ std::cout << (passed ? "Passed" : "Failed") << std::endl;
+
+ return (passed ? 0 : -1);
}
+
+int main() {
+
+ // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1.
+ //
+ // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
+ if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
+ std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
+
+ // Returning zero when built on older Toolkits so tests pass. The actions of this SDK example are no-op.
+ return 0;
+ }
+ else {
+ return run();
+ }
+}
+
diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt
index 9e011a1e..b4e4fe82 100644
--- a/examples/08_turing_tensorop_gemm/CMakeLists.txt
+++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
index 3440d82f..d7ba8331 100644
--- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
+++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -150,12 +150,12 @@ using SmArch = cutlass::arch::Sm75;
using ShapeMMAThreadBlock =
cutlass::gemm::GemmShape<128, 256, 64>; // <- threadblock tile M = 128, N = 256, K = 64
// This code section describes tile size a warp will compute
-using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 16
+using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 64
// This code section describes the size of MMA op
using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 16>; // <- MMA Op tile M = 8, N = 8, K = 16
// This code section describes how threadblocks are scheduled on GPU
-using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ??
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ??
// This code section describes the epilogue part of the kernel
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
@@ -186,7 +186,7 @@ using Gemm = cutlass::gemm::device::Gemm;
-int main() {
+int run() {
// Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available
// in CUDA 10.2.
@@ -222,7 +222,7 @@ int main() {
cutlass::HostTensor tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
cutlass::HostTensor tensor_b(
- problem_size.nk()); // <- Create matrix B with dimensions N x K
+ problem_size.kn()); // <- Create matrix B with dimensions K x N
cutlass::HostTensor tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
cutlass::HostTensor tensor_d(
@@ -325,12 +325,28 @@ int main() {
tensor_ref_d.sync_host();
// Check if output from CUTLASS kernel and reference kernel are equal or not
- std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(),
- tensor_ref_d.host_view())
- ? "Passed"
- : "Failed")
- << std::endl;
+ bool passed = cutlass::reference::host::TensorEquals(
+ tensor_d.host_view(),
+ tensor_ref_d.host_view());
- CUTLASS_CHECK(status);
- return 0;
+ std::cout << (passed ? "Passed" : "Failed") << std::endl;
+
+ return (passed ? 0 : -1);
}
+
+int main() {
+ // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available
+ // in CUDA 10.2.
+ //
+ // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
+ if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+ std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+
+ // Returning zero so this test passes when built on older Toolkits.
+ return 0;
+ }
+ else {
+ return run();
+ }
+}
+
diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu
index 7fc92870..b7318b99 100644
--- a/examples/10_planar_complex/planar_complex.cu
+++ b/examples/10_planar_complex/planar_complex.cu
@@ -500,7 +500,9 @@ int main(int argc, char const **args) {
if (props.major < 7) {
std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70."
<< std::endl;
- return -1;
+
+ // Returning zero so this test passes on older architectures even though its actions are no-op.
+ return 0;
}
else if (props.major == 7 && props.minor <= 2) {
//
@@ -508,7 +510,9 @@ int main(int argc, char const **args) {
//
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
- return -1;
+
+ // Returning zero so this test passes on older Toolkits even though its actions are no-op.
+ return 0;
}
}
else if (props.major == 7 && props.minor >= 5) {
@@ -517,7 +521,9 @@ int main(int argc, char const **args) {
//
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
- return -1;
+
+ // Returning zero so this test passes on older Toolkits even though its actions are no-op.
+ return 0;
}
}
diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu
index 3003a900..6a027053 100644
--- a/examples/11_planar_complex_array/planar_complex_array.cu
+++ b/examples/11_planar_complex_array/planar_complex_array.cu
@@ -560,7 +560,9 @@ int main(int argc, char const **args) {
if (props.major < 7) {
std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70."
<< std::endl;
- return -1;
+
+ // Returning zero so this passes on older architectures. Its actions are no-op.
+ return 0;
}
else if (props.major == 7 && props.minor <= 2) {
//
@@ -568,7 +570,9 @@ int main(int argc, char const **args) {
//
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) {
std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl;
- return -1;
+
+ // Returning zero so this passes on older Toolkits. Its actions are no-op.
+ return 0;
}
}
else if (props.major == 7 && props.minor >= 5) {
@@ -577,7 +581,9 @@ int main(int argc, char const **args) {
//
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
- return -1;
+
+ // Returning zero so this passes on older Toolkits. Its actions are no-op.
+ return 0;
}
}
diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt
new file mode 100644
index 00000000..fb78d77f
--- /dev/null
+++ b/examples/12_gemm_bias_relu/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this list of
+# conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, this list of
+# conditions and the following disclaimer in the documentation and/or other materials
+# provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific prior written
+# permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cutlass_example_add_executable(
+ 12_gemm_bias_relu
+ gemm_bias_relu.cu
+ )
+
diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
new file mode 100644
index 00000000..7faaa98a
--- /dev/null
+++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
@@ -0,0 +1,282 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+*/
+
+#include
+#include
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "helper.h"
+
+// The code section below describes datatype for input, output matrices and computation between
+// elements in input matrices.
+using ElementAccumulator = float; // <- data type of accumulator
+using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
+using ElementInputA = cutlass::half_t; // <- data type of elements in input matrix A
+using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B
+using ElementOutput = float; // <- data type of elements in output matrix D
+
+// The code section below describes matrix layout of input and output matrices. Column Major for
+// Matrix A, Column Major for Matrix B and Row Major for Matrix C
+using LayoutInputA = cutlass::layout::ColumnMajor;
+using LayoutInputB = cutlass::layout::ColumnMajor;
+using LayoutOutput = cutlass::layout::RowMajor;
+
+// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
+using MMAOp = cutlass::arch::OpClassTensorOp;
+
+// This code section describes CUDA SM architecture number
+using SmArch = cutlass::arch::Sm75;
+
+// This code section describes the tile size a thread block will compute
+using ShapeMMAThreadBlock =
+ cutlass::gemm::GemmShape<128, 128, 32>; // <- threadblock tile M = 128, N = 128, K = 32
+// This code section describes tile size a warp will compute
+using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = 64, N = 64, K = 32
+// This code section describes the size of MMA op
+using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8
+
+// This code section describes how threadblocks are scheduled on GPU
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ??
+
+// Define the epilogue operation as LinearCombinationRelu. This is approximately equal to
+//
+// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij )
+//
+using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput, // <- data type of output matrix
+ 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per
+ // vectorized memory access. For half
+ // precision, it's 8 elements. This becomes
+ // the vector width of math instructions in
+ // epilogue too
+ ElementAccumulator, // <- data type of accumulator
+ ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
+
+// Number of pipelines you want to use
+constexpr int NumStages = 2;
+
+using Gemm = cutlass::gemm::device::Gemm;
+
+int run() {
+
+ cudaDeviceProp props;
+
+ cudaError_t error = cudaGetDeviceProperties(&props, 0);
+ if (error != cudaSuccess) {
+ std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+ return -1;
+ }
+
+ if (!(props.major * 10 + props.minor >= 75)) {
+ std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
+ << std::endl;
+ // Returning zero so this test passes on older architectures. Its actions are no-ops.
+ return 0;
+ }
+
+ const int length_m = 5120;
+ const int length_n = 4096;
+ const int length_k = 4096;
+
+ // Create a tuple of problem size for matrix multiplication
+ cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k);
+
+ // Initialize tensors using CUTLASS helper functions
+ cutlass::HostTensor tensor_a(
+ problem_size.mk()); // <- Create matrix A with dimensions M x K
+ cutlass::HostTensor tensor_b(
+ problem_size.kn()); // <- Create matrix B with dimensions K x N
+
+ cutlass::HostTensor tensor_c_bias(
+ {problem_size.m(), 1}); // <- Create matrix C with dimensions M x 1
+
+ cutlass::HostTensor tensor_d(
+ problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
+ // CUTLASS kernel
+ cutlass::HostTensor tensor_ref_d(
+ problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
+ // reference kernel
+
+ // Fill input and output matrices on host using CUTLASS helper functions
+ cutlass::reference::host::TensorFillRandomUniform(
+ tensor_a.host_view(),
+ 1,
+ ElementInputA(4),
+ ElementInputA(-4),
+ 0); // <- Fill matrix A on host with uniform-distribution random data
+ cutlass::reference::host::TensorFillRandomUniform(
+ tensor_b.host_view(),
+ 1,
+ ElementInputB(4),
+ ElementInputB(-4),
+ 0); // <- Fill matrix B on host with uniform-distribution random data
+ cutlass::reference::host::TensorFillRandomUniform(
+ tensor_c_bias.host_view(),
+ 1,
+ ElementOutput(4),
+ ElementOutput(-4),
+ 0); // <- Fill matrix C on host with uniform-distribution random data
+ cutlass::reference::host::TensorFill(
+ tensor_d.host_view()); // <- fill matrix D on host with zeros
+ cutlass::reference::host::TensorFill(
+ tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
+
+ // Copy data from host to GPU
+ tensor_a.sync_device();
+ tensor_b.sync_device();
+ tensor_c_bias.sync_device();
+ tensor_d.sync_device();
+ tensor_ref_d.sync_device();
+
+ // Initialize alpha and beta for dot product computation
+ ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
+ ElementComputeEpilogue beta = ElementComputeEpilogue(0);
+
+ // Split K dimension into 1 partitions
+ int split_k_slices = 1;
+
+ // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
+ // instantiated CUTLASS kernel
+ typename Gemm::Arguments arguments{
+ problem_size, // <- problem size of matrix multiplication
+ tensor_a.device_ref(), // <- reference to matrix A on device
+ tensor_b.device_ref(), // <- reference to matrix B on device
+
+ {tensor_c_bias.device_data(), 0}, // <- the C matrix is treated as the bias vector. We can enable the GEMM
+ // to project away the N dimension by setting the stride to zero.
+
+ tensor_d.device_ref(), // <- reference to matrix D on device
+ {alpha, beta}, // <- tuple of alpha and beta
+ split_k_slices}; // <- k-dimension split factor
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation workspace(workspace_size);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm_op;
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
+ CUTLASS_CHECK(status);
+
+ // Launch initialized CUTLASS kernel
+ status = gemm_op();
+ CUTLASS_CHECK(status);
+
+ //
+ // Create instantiation for device reference gemm kernel
+ //
+
+ cutlass::reference::device::Gemm
+ gemm_device_reference;
+
+ // Launch device reference to compute strictly the product A * B
+ gemm_device_reference(
+ problem_size,
+ alpha,
+ tensor_a.device_ref(),
+ tensor_b.device_ref(),
+ 0,
+ tensor_c_bias.device_ref(),
+ tensor_ref_d.device_ref());
+
+ // Wait for kernels to finish
+ cudaDeviceSynchronize();
+
+ // Copy output data from CUTLASS and reference kernel to host for comparison
+ tensor_d.sync_host();
+ tensor_ref_d.sync_host();
+
+ // Compute bias + relu in host code
+ for (int i = 0; i < problem_size.m(); ++i) {
+ for (int j = 0; j < problem_size.n(); ++j) {
+ tensor_ref_d.at({i, j}) = std::max(
+ ElementOutput(0),
+ ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0}))
+ );
+ }
+ }
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(),
+ tensor_ref_d.host_view())
+ ? "Passed"
+ : "Failed")
+ << std::endl;
+
+ CUTLASS_CHECK(status);
+ return 0;
+}
+
+int main() {
+ // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
+ //
+ // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
+ if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+ std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+ else {
+ return run();
+ }
+}
+
diff --git a/examples/13_fused_two_gemms/CMakeLists.txt b/examples/13_fused_two_gemms/CMakeLists.txt
new file mode 100644
index 00000000..ba51537c
--- /dev/null
+++ b/examples/13_fused_two_gemms/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this list of
+# conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, this list of
+# conditions and the following disclaimer in the documentation and/or other materials
+# provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific prior written
+# permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cutlass_example_add_executable(
+ 13_fused_two_gemms
+ fused_gemm.cu
+ )
+
+target_include_directories(
+ 13_fused_two_gemms
+ PRIVATE
+ .
+ )
+
diff --git a/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h b/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h
new file mode 100644
index 00000000..10a0d4bf
--- /dev/null
+++ b/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h
@@ -0,0 +1,190 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "device/b2b_gemm.h"
+#include "b2b_gemm_run.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+void run_nonfused_gemm_f16() {
+
+ using ElementOutput = cutlass::half_t;
+ using ElementAccumulator = cutlass::half_t;
+ using ElementCompute = cutlass::half_t;
+
+ cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
+ cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
+ ElementCompute alpha0 = ElementCompute(2);
+ ElementCompute beta0 = ElementCompute(0);
+ ElementCompute alpha1 = ElementCompute(2);
+ ElementCompute beta1 = ElementCompute(1);
+
+ using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
+ using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+ using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
+ using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+ using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+ using Gemm0 = cutlass::gemm::device::Gemm<
+ cutlass::half_t,
+ cutlass::layout::RowMajor,
+ cutlass::half_t,
+ cutlass::layout::ColumnMajor,
+ ElementOutput,
+ cutlass::layout::RowMajor,
+ ElementAccumulator,
+ cutlass::arch::OpClassTensorOp,
+ cutlass::arch::Sm75,
+ ThreadblockShape0,
+ WarpShape0,
+ InstructionShape,
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+ 128 / cutlass::sizeof_bits::value,
+ ElementAccumulator,
+ ElementCompute
+ >,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+ 2
+ >;
+ using Gemm1 = cutlass::gemm::device::Gemm<
+ cutlass::half_t,
+ cutlass::layout::RowMajor,
+ cutlass::half_t,
+ cutlass::layout::ColumnMajor,
+ ElementOutput,
+ cutlass::layout::RowMajor,
+ ElementAccumulator,
+ cutlass::arch::OpClassTensorOp,
+ cutlass::arch::Sm75,
+ ThreadblockShape1,
+ WarpShape1,
+ InstructionShape,
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+ 128 / cutlass::sizeof_bits::value,
+ ElementAccumulator,
+ ElementCompute
+ >,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+ 2
+ >;
+
+ B2bNonFusedGemmRun nonFusedGemm;
+
+ std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
+ bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+ if(pass)
+ std::cout << "Pass\n";
+ else
+ std::cout << "Fail\n";
+}
+
+void run_fused_gemm_f16() {
+
+ using ElementOutput = cutlass::half_t;
+ using ElementAccumulator = cutlass::half_t;
+ using ElementCompute = cutlass::half_t;
+
+ cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
+ cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
+ ElementCompute alpha0 = ElementCompute(2);
+ ElementCompute beta0 = ElementCompute(0);
+ ElementCompute alpha1 = ElementCompute(2);
+ ElementCompute beta1 = ElementCompute(1);
+
+ using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
+ using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+ using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
+ using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
+ using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+ using EpilogueOutputOp0 =
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+ InstructionShape::kM * InstructionShape::kN / 32,
+ ElementAccumulator,
+ ElementCompute
+ >;
+
+ using EpilogueOutputOp1 =
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+ 128 / cutlass::sizeof_bits::value,
+ ElementAccumulator,
+ ElementCompute
+ >;
+
+
+
+ using B2bGemm = cutlass::gemm::device::B2bGemm<
+ cutlass::half_t,
+ cutlass::layout::RowMajor,
+ cutlass::half_t,
+ cutlass::layout::ColumnMajor,
+ ElementOutput,
+ cutlass::layout::RowMajor,
+ ElementAccumulator,
+ cutlass::arch::OpClassTensorOp,
+ cutlass::arch::Sm75,
+ ThreadblockShape0,
+ ThreadblockShape1,
+ WarpShape0,
+ WarpShape1,
+ InstructionShape,
+ EpilogueOutputOp0,
+ EpilogueOutputOp1,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+ 2
+ >;
+
+ B2bFusedGemmRun fusedGemm;
+
+ std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
+ bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+ if(passed)
+ std::cout << "Pass\n";
+ else
+ std::cout << "Fail\n";
+
+}
+////////////////////////////////////////////////////////////////////////////////
+
+#endif //#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
diff --git a/examples/13_fused_two_gemms/b2b_gemm_run.h b/examples/13_fused_two_gemms/b2b_gemm_run.h
new file mode 100644
index 00000000..053064d7
--- /dev/null
+++ b/examples/13_fused_two_gemms/b2b_gemm_run.h
@@ -0,0 +1,608 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_relu.h"
+
+#include "helper.h"
+
+#define CHECK_GT(val1, val2) \
+ if((val1) <= (val2)) \
+ std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
+#define CHECK_TRUE(val) \
+ if(!(val)) \
+ std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename Gemm0_, typename Gemm1_>
+struct B2bNonFusedGemmRun
+{
+
+ using Gemm0 = Gemm0_;
+ using Gemm1 = Gemm1_;
+ using ElementAccumulator = typename Gemm0::ElementAccumulator;
+ using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute;
+
+ /// Initialization
+ cutlass::Distribution::Kind init_A;
+ cutlass::Distribution::Kind init_B;
+ cutlass::Distribution::Kind init_C;
+ uint64_t seed;
+
+ //
+ // Methods
+ //
+
+ B2bNonFusedGemmRun(
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+ uint64_t seed_ = 2080
+ ):
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
+
+ /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_tensor(
+    cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
+
+ if (dist_kind == cutlass::Distribution::Uniform) {
+
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, 2, -2, 0);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+
+ cutlass::reference::host::BlockFillSequential(
+ view.data(), view.capacity());
+ }
+ else {
+ // TODO: Implement the rest
+ std::cerr << "Not implemented\n";
+ return false;
+ }
+
+ return true;
+ }
+
+
+
+
+ /// Executes one test
+ bool run(
+ cutlass::gemm::GemmCoord problem_size_0,
+ cutlass::gemm::GemmCoord problem_size_1,
+ ElementCompute alpha0 = ElementCompute(1),
+ ElementCompute beta0 = ElementCompute(0),
+ ElementCompute alpha1 = ElementCompute(1),
+ ElementCompute beta1 = ElementCompute(0),
+ bool relu = true) {
+
+ //
+ // Allocate the GEMM workspace
+ //
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementA,
+ typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementB,
+ typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementC,
+ typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementC,
+ typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementC,
+ typename Gemm0::LayoutC> reference_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementB,
+ typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementC,
+ typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementC,
+ typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementC,
+ typename Gemm1::LayoutC> reference_D1(problem_size_1.mn());
+
+
+ CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
+ CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
+ CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
+ CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
+ CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
+
+ cutlass::reference::host::TensorFill(
+ tensor_D0.host_view());
+ cutlass::reference::host::TensorFill(
+ tensor_D1.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D0.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D1.host_view());
+
+ tensor_A0.sync_device();
+ tensor_B0.sync_device();
+ tensor_C0.sync_device();
+ tensor_D0.sync_device();
+ tensor_B1.sync_device();
+ tensor_C1.sync_device();
+ tensor_D1.sync_device();
+ reference_D0.sync_device();
+ reference_D1.sync_device();
+
+ //
+ // Initialize the GEMM operator
+ //
+
+ typename Gemm0::Arguments arguments_0{
+ problem_size_0,
+ tensor_A0.device_ref(),
+ tensor_B0.device_ref(),
+ tensor_C0.device_ref(),
+ tensor_D0.device_ref(),
+ {alpha0, beta0}
+ };
+
+ typename Gemm1::Arguments arguments_1{
+ problem_size_1,
+ tensor_D0.device_ref(),
+ tensor_B1.device_ref(),
+ tensor_C1.device_ref(),
+ tensor_D1.device_ref(),
+ {alpha1, beta1}
+ };
+
+
+ Gemm0 gemm_op_0;
+ Gemm1 gemm_op_1;
+
+ cutlass::Status status = gemm_op_0.initialize(arguments_0);
+
+ CUTLASS_CHECK(status);
+
+ status = gemm_op_1.initialize(arguments_1);
+
+ CUTLASS_CHECK(status);
+ //
+ // Run the GEMM
+ //
+
+ cudaEvent_t start, stop1, stop2;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop1);
+ cudaEventCreate(&stop2);
+
+ cudaEventRecord(start);
+
+ for(int i = 0; i < 100; i++) {
+ status = gemm_op_0();
+
+ CUTLASS_CHECK(status);
+ }
+ cudaEventRecord(stop1);
+ for(int i = 0; i < 100; i++) {
+
+ status = gemm_op_1();
+
+ CUTLASS_CHECK(status);
+ }
+
+ cudaEventRecord(stop2);
+ cudaDeviceSynchronize();
+ float gemm0Time, gemm1Time, totalTime;
+ cudaEventElapsedTime(&gemm0Time, start, stop1);
+ cudaEventElapsedTime(&gemm1Time, stop1, stop2);
+ cudaEventElapsedTime(&totalTime, start, stop2);
+ std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n";
+ std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n";
+ std::cout << "total time " << totalTime / 100.0 << " ms\n";
+
+ tensor_D0.sync_host();
+ tensor_D1.sync_host();
+
+ //
+ // Verify
+ //
+ cutlass::reference::device::Gemm<
+ typename Gemm0::ElementA, typename Gemm0::LayoutA,
+ typename Gemm0::ElementB, typename Gemm0::LayoutB,
+ typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute,
+ ElementAccumulator, typename Gemm0::Operator>
+ reference_gemm_0;
+
+ cutlass::reference::device::Gemm<
+ typename Gemm1::ElementA, typename Gemm1::LayoutA,
+ typename Gemm1::ElementB, typename Gemm1::LayoutB,
+ typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute,
+ ElementAccumulator, typename Gemm1::Operator>
+ reference_gemm_1;
+
+ reference_gemm_0(
+ problem_size_0,
+ alpha0,
+ tensor_A0.device_ref(),
+ tensor_B0.device_ref(),
+ beta0,
+ tensor_C0.device_ref(),
+ reference_D0.device_ref()
+ );
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D0.device_view());
+ }
+
+ reference_gemm_1(
+ problem_size_1,
+ alpha1,
+ reference_D0.device_ref(),
+ tensor_B1.device_ref(),
+ beta1,
+ tensor_C1.device_ref(),
+ reference_D1.device_ref()
+ );
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D1.device_view());
+ }
+
+ // Wait for kernels to finish
+ cudaDeviceSynchronize();
+ reference_D0.sync_host();
+ reference_D1.sync_host();
+
+
+ CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
+
+ bool passed = cutlass::reference::host::TensorEquals(
+ reference_D1.host_view(),
+ tensor_D1.host_view());
+
+ CHECK_TRUE(passed);
+ if (!passed) {
+
+ std::stringstream fname;
+
+ fname << "error_B2bGemm_device_nonfused.txt";
+ std::cerr << "Dumping results in " << fname.str() << "\n";
+
+ std::ofstream file(fname.str());
+
+ file
+ << "A0 =\n" << tensor_A0.host_view()
+ << "\nB0 =\n" << tensor_B0.host_view()
+ << "\nC0 =\n" << tensor_C0.host_view()
+ << "\nD0 =\n" << tensor_D0.host_view()
+ << "\nB1 =\n" << tensor_B1.host_view()
+ << "\nC1 =\n" << tensor_C1.host_view()
+ << "\n\nReference =\n" << reference_D1.host_view()
+ << "\nComputed =\n" << tensor_D1.host_view();
+ }
+
+ return passed;
+ }
+};
+
+template <typename B2bGemm_>
+struct B2bFusedGemmRun
+{
+
+ using B2bGemm = B2bGemm_;
+ using ElementAccumulator = typename B2bGemm::ElementAccumulator;
+ using ElementCompute = typename B2bGemm::B2bGemmKernel::Epilogue::OutputOp::ElementCompute;
+
+ /// Initialization
+ cutlass::Distribution::Kind init_A;
+ cutlass::Distribution::Kind init_B;
+ cutlass::Distribution::Kind init_C;
+ uint64_t seed;
+
+ //
+ // Methods
+ //
+
+ B2bFusedGemmRun(
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+ uint64_t seed_ = 2080
+ ):
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
+
+ /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_tensor(
+    cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
+
+ if (dist_kind == cutlass::Distribution::Uniform) {
+
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, 2, -2, 0);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+
+ cutlass::reference::host::BlockFillSequential(
+ view.data(), view.capacity());
+ }
+ else {
+ // TODO: Implement the rest
+ std::cerr << "Not implemented\n";
+ return false;
+ }
+
+ return true;
+ }
+
+
+
+
+ /// Executes one test
+ bool run(
+ cutlass::gemm::GemmCoord problem_size_0,
+ cutlass::gemm::GemmCoord problem_size_1,
+ ElementCompute alpha0 = ElementCompute(1),
+ ElementCompute beta0 = ElementCompute(0),
+ ElementCompute alpha1 = ElementCompute(1),
+ ElementCompute beta1 = ElementCompute(0),
+ bool relu = true) {
+
+ //
+ // Allocate the GEMM workspace
+ //
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementA,
+ typename B2bGemm::LayoutA> tensor_A0(problem_size_0.mk());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementB,
+ typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());
+
+// cutlass::HostTensor<
+// typename B2bGemm::ElementC,
+// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementB,
+ typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn());
+
+
+ CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
+ CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
+ CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
+ CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
+ CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
+
+ cutlass::reference::host::TensorFill(
+ tensor_D1.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D0.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D1.host_view());
+
+ tensor_A0.sync_device();
+ tensor_B0.sync_device();
+ tensor_C0.sync_device();
+ tensor_B1.sync_device();
+ tensor_C1.sync_device();
+ tensor_D1.sync_device();
+ reference_D0.sync_device();
+ reference_D1.sync_device();
+
+ //
+ // Initialize the GEMM operator
+ //
+
+ typename B2bGemm::Arguments arguments{
+ problem_size_0,
+ problem_size_1,
+ tensor_A0.device_ref(),
+ tensor_B0.device_ref(),
+ tensor_C0.device_ref(),
+ tensor_B1.device_ref(),
+ tensor_C1.device_ref(),
+ tensor_D1.device_ref(),
+ {alpha0, beta0},
+ {alpha1, beta1},
+ };
+
+ B2bGemm b2b_gemm_op;
+
+ cutlass::Status status = b2b_gemm_op.initialize(arguments);
+
+ CUTLASS_CHECK(status);
+
+ //
+ // Run the GEMM
+ //
+
+ cudaEvent_t start, stop;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+
+ cudaEventRecord(start);
+
+ for(int i = 0; i < 100; i++) {
+ status = b2b_gemm_op();
+
+ CUTLASS_CHECK(status);
+ }
+
+ cudaEventRecord(stop);
+ cudaDeviceSynchronize();
+ float gemmTime;
+ cudaEventElapsedTime(&gemmTime, start, stop);
+ std::cout << "time " << gemmTime / 100.0 << " ms\n";
+
+ //tensor_D0.sync_host();
+ tensor_D1.sync_host();
+
+ //
+ // Verify
+ //
+ cutlass::reference::device::Gemm<
+ typename B2bGemm::ElementA, typename B2bGemm::LayoutA,
+ typename B2bGemm::ElementB, typename B2bGemm::LayoutB,
+ typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute,
+ ElementAccumulator, typename B2bGemm::Operator>
+ reference_gemm_0, reference_gemm_1;
+
+ reference_gemm_0(
+ problem_size_0,
+ alpha0,
+ tensor_A0.device_ref(),
+ tensor_B0.device_ref(),
+ beta0,
+ tensor_C0.device_ref(),
+ reference_D0.device_ref()
+ );
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D0.device_view());
+ }
+
+ reference_gemm_1(
+ problem_size_1,
+ alpha1,
+ reference_D0.device_ref(),
+ tensor_B1.device_ref(),
+ beta1,
+ tensor_C1.device_ref(),
+ reference_D1.device_ref()
+ );
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D1.device_view());
+ }
+
+ cudaDeviceSynchronize();
+ reference_D0.sync_host();
+ reference_D1.sync_host();
+
+
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
+
+ bool passed = cutlass::reference::host::TensorEquals(
+ reference_D1.host_view(),
+ tensor_D1.host_view());
+
+ CHECK_TRUE(passed);
+ if (!passed) {
+
+ std::stringstream fname;
+
+ fname << "error_B2bGemm_device_fused.txt";
+ std::cerr << "Dumping results in " << fname.str() << "\n";
+
+ std::ofstream file(fname.str());
+
+ file
+ << "A0 =\n" << tensor_A0.host_view()
+ << "\nB0 =\n" << tensor_B0.host_view()
+ << "\nC0 =\n" << tensor_C0.host_view()
+// << "\nD0 =\n" << tensor_D0.host_view()
+ << "\nB1 =\n" << tensor_B1.host_view()
+ << "\nC1 =\n" << tensor_C1.host_view()
+ << "\n\nReference =\n" << reference_D1.host_view()
+ << "\nComputed =\n" << tensor_D1.host_view();
+ }
+
+ return passed;
+ }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h b/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h
new file mode 100644
index 00000000..1c3f15c2
--- /dev/null
+++ b/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h
@@ -0,0 +1,190 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "device/b2b_gemm.h"
+#include "b2b_interleaved_gemm_run.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+void run_nonfused_gemm_s8() {
+
+ using ElementOutput = int8_t;
+ using ElementAccumulator = int32_t;
+ using ElementCompute = float;
+
+ cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
+ cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
+ ElementCompute alpha0 = ElementCompute(2);
+ ElementCompute beta0 = ElementCompute(0);
+ ElementCompute alpha1 = ElementCompute(2);
+ ElementCompute beta1 = ElementCompute(1);
+
+ using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+ using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+ using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+ using WarpShape1 = cutlass::gemm::GemmShape<32, 32, 64>;
+ using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+ using Gemm0 = cutlass::gemm::device::Gemm<
+ int8_t,
+ cutlass::layout::ColumnMajorInterleaved<32>,
+ int8_t,
+ cutlass::layout::RowMajorInterleaved<32>,
+ ElementOutput,
+ cutlass::layout::ColumnMajorInterleaved<32>,
+ ElementAccumulator,
+ cutlass::arch::OpClassTensorOp,
+ cutlass::arch::Sm75,
+ ThreadblockShape0,
+ WarpShape0,
+ InstructionShape,
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator,
+ ElementCompute
+ >,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+ 2
+ >;
+ using Gemm1 = cutlass::gemm::device::Gemm<
+ int8_t,
+ cutlass::layout::ColumnMajorInterleaved<32>,
+ int8_t,
+ cutlass::layout::RowMajorInterleaved<32>,
+ ElementOutput,
+ cutlass::layout::ColumnMajorInterleaved<32>,
+ ElementAccumulator,
+ cutlass::arch::OpClassTensorOp,
+ cutlass::arch::Sm75,
+ ThreadblockShape1,
+ WarpShape1,
+ InstructionShape,
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator,
+ ElementCompute
+ >,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+ 2
+ >;
+
+  B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
+
+ std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
+ bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+ if(pass)
+ std::cout << "Pass\n";
+ else
+ std::cout << "Fail\n";
+}
+
+void run_fused_gemm_s8() {
+
+ using ElementOutput = int8_t;
+ using ElementAccumulator = int32_t;
+ using ElementCompute = float;
+
+ cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
+ cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
+ ElementCompute alpha0 = ElementCompute(2);
+ ElementCompute beta0 = ElementCompute(0);
+ ElementCompute alpha1 = ElementCompute(2);
+ ElementCompute beta1 = ElementCompute(1);
+
+ using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
+ using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+ using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
+ using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 64>;
+ using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+ using EpilogueOutputOp0 =
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+ InstructionShape::kM * InstructionShape::kN / 32,
+ ElementAccumulator,
+ ElementCompute
+ >;
+
+ using EpilogueOutputOp1 =
+ cutlass::epilogue::thread::LinearCombinationRelu<
+ ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator,
+ ElementCompute
+ >;
+
+
+
+ using B2bGemm = cutlass::gemm::device::B2bGemm<
+ int8_t,
+ cutlass::layout::ColumnMajorInterleaved<32>,
+ int8_t,
+ cutlass::layout::RowMajorInterleaved<32>,
+ ElementOutput,
+ cutlass::layout::ColumnMajorInterleaved<32>,
+ ElementAccumulator,
+ cutlass::arch::OpClassTensorOp,
+ cutlass::arch::Sm75,
+ ThreadblockShape0,
+ ThreadblockShape1,
+ WarpShape0,
+ WarpShape1,
+ InstructionShape,
+ EpilogueOutputOp0,
+ EpilogueOutputOp1,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+ 2
+ >;
+
+  B2bInterleavedFusedGemmRun<B2bGemm> fusedGemm;
+
+ std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
+ bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+ if(passed)
+ std::cout << "Pass\n";
+ else
+ std::cout << "Fail\n";
+
+}
+////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
diff --git a/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h b/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h
new file mode 100644
index 00000000..906cabb4
--- /dev/null
+++ b/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h
@@ -0,0 +1,633 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/host_reorder.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "helper.h"
+
+#define CHECK_GT(val1, val2) \
+ if((val1) <= (val2)) \
+ std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
+#define CHECK_TRUE(val) \
+ if(!(val)) \
+ std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
+
+template <typename Gemm0_, typename Gemm1_>
+struct B2bInterleavedNonFusedGemmRun
+{
+
+ using Gemm0 = Gemm0_;
+ using Gemm1 = Gemm1_;
+ using ElementAccumulator = typename Gemm0::ElementAccumulator;
+ using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute;
+
+ /// Initialization
+ cutlass::Distribution::Kind init_A;
+ cutlass::Distribution::Kind init_B;
+ cutlass::Distribution::Kind init_C;
+ uint64_t seed;
+
+ //
+ // Methods
+ //
+
+ B2bInterleavedNonFusedGemmRun(
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+ uint64_t seed_ = 2080
+ ):
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
+
+ /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_tensor(
+    cutlass::TensorView<Element, Layout> view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
+
+ if (dist_kind == cutlass::Distribution::Uniform) {
+
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, 2, -2, 0);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+
+ cutlass::reference::host::BlockFillSequential(
+ view.data(), view.capacity());
+ }
+ else {
+ // TODO: Implement the rest
+ std::cerr << "Not implemented\n";
+ return false;
+ }
+
+ return true;
+ }
+
+
+
+
+ /// Executes one test
+ bool run(
+ cutlass::gemm::GemmCoord problem_size_0,
+ cutlass::gemm::GemmCoord problem_size_1,
+ ElementCompute alpha0 = ElementCompute(1),
+ ElementCompute beta0 = ElementCompute(0),
+ ElementCompute alpha1 = ElementCompute(1),
+ ElementCompute beta1 = ElementCompute(0),
+ bool relu = true) {
+
+ //
+ // Allocate the GEMM workspace
+ //
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementA,
+ typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementB,
+ typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementB,
+ typename Gemm0::LayoutB> tensor_B0_reordered(problem_size_0.kn());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementC,
+ typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementC,
+ typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename Gemm0::ElementC,
+ typename Gemm0::LayoutC> reference_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementB,
+ typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementB,
+ typename Gemm1::LayoutB> tensor_B1_reordered(problem_size_1.kn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementC,
+ typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementC,
+ typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename Gemm1::ElementC,
+ typename Gemm1::LayoutC> reference_D1(problem_size_1.mn());
+
+
+ CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
+ CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
+ CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
+ CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
+ CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
+
+ //Reorder B0 and B1
+    cutlass::reorder_column<32>(
+ tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0);
+    cutlass::reorder_column<32>(
+ tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1);
+
+ cutlass::reference::host::TensorFill(
+ tensor_D0.host_view());
+ cutlass::reference::host::TensorFill(
+ tensor_D1.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D0.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D1.host_view());
+
+ tensor_A0.sync_device();
+ tensor_B0.sync_device();
+ tensor_B0_reordered.sync_device();
+ tensor_C0.sync_device();
+ tensor_D0.sync_device();
+ tensor_B1.sync_device();
+ tensor_B1_reordered.sync_device();
+ tensor_C1.sync_device();
+ tensor_D1.sync_device();
+ reference_D0.sync_device();
+ reference_D1.sync_device();
+
+ //
+ // Initialize the GEMM operator
+ //
+
+ typename Gemm0::Arguments arguments_0{
+ problem_size_0,
+ tensor_A0.device_ref(),
+ tensor_B0_reordered.device_ref(),
+ tensor_C0.device_ref(),
+ tensor_D0.device_ref(),
+ {alpha0, beta0}
+ };
+
+ typename Gemm1::Arguments arguments_1{
+ problem_size_1,
+ tensor_D0.device_ref(),
+ tensor_B1_reordered.device_ref(),
+ tensor_C1.device_ref(),
+ tensor_D1.device_ref(),
+ {alpha1, beta1}
+ };
+
+
+ Gemm0 gemm_op_0;
+ Gemm1 gemm_op_1;
+
+ cutlass::Status status = gemm_op_0.initialize(arguments_0);
+
+ CUTLASS_CHECK(status);
+
+ status = gemm_op_1.initialize(arguments_1);
+
+ CUTLASS_CHECK(status);
+ //
+ // Run the GEMM
+ //
+ cudaEvent_t start, stop1, stop2;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop1);
+ cudaEventCreate(&stop2);
+
+ cudaEventRecord(start);
+
+ for(int i = 0; i < 100; i++) {
+ status = gemm_op_0();
+
+ CUTLASS_CHECK(status);
+ }
+ cudaEventRecord(stop1);
+
+ for(int i = 0; i < 100; i++) {
+ status = gemm_op_1();
+
+ CUTLASS_CHECK(status);
+ }
+
+ cudaEventRecord(stop2);
+ cudaDeviceSynchronize();
+ float gemm0Time, gemm1Time, totalTime;
+ cudaEventElapsedTime(&gemm0Time, start, stop1);
+ cudaEventElapsedTime(&gemm1Time, stop1, stop2);
+ cudaEventElapsedTime(&totalTime, start, stop2);
+ std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n";
+ std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n";
+ std::cout << "total time " << totalTime / 100.0 << " ms\n";
+
+ tensor_D0.sync_host();
+ tensor_D1.sync_host();
+
+ //
+ // Verify
+ //
+ cutlass::reference::device::Gemm<
+ typename Gemm0::ElementA, typename Gemm0::LayoutA,
+ typename Gemm0::ElementB, typename Gemm0::LayoutB,
+ typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute,
+ ElementAccumulator, typename Gemm0::Operator>
+ reference_gemm_0;
+
+ cutlass::reference::device::Gemm<
+ typename Gemm1::ElementA, typename Gemm1::LayoutA,
+ typename Gemm1::ElementB, typename Gemm1::LayoutB,
+ typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute,
+ ElementAccumulator, typename Gemm1::Operator>
+ reference_gemm_1;
+
+ reference_gemm_0(
+ problem_size_0,
+ alpha0,
+ tensor_A0.device_ref(),
+ tensor_B0.device_ref(),
+ beta0,
+ tensor_C0.device_ref(),
+ reference_D0.device_ref()
+ );
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D0.device_view());
+ }
+
+ reference_gemm_1(
+ problem_size_1,
+ alpha1,
+ tensor_D0.device_ref(),
+ tensor_B1.device_ref(),
+ beta1,
+ tensor_C1.device_ref(),
+ reference_D1.device_ref()
+ );
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D1.device_view());
+ }
+
+ cudaDeviceSynchronize();
+ reference_D0.sync_host();
+ reference_D1.sync_host();
+
+ CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
+
+ bool passed = cutlass::reference::host::TensorEquals(
+ reference_D1.host_view(),
+ tensor_D1.host_view());
+
+ CHECK_TRUE(passed);
+ if (!passed) {
+
+ std::stringstream fname;
+
+ fname << "error_B2bGemm_device_interleaved_nonfused.txt";
+ std::cerr << "Dumping results in " << fname.str() << "\n";
+
+ std::ofstream file(fname.str());
+
+ file
+ << "A0 =\n" << tensor_A0.host_view()
+ << "\nB0 =\n" << tensor_B0.host_view()
+ << "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
+ << "\nC0 =\n" << tensor_C0.host_view()
+ << "\nD0 =\n" << tensor_D0.host_view()
+ << "\nB1 =\n" << tensor_B1.host_view()
+ << "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
+ << "\nC1 =\n" << tensor_C1.host_view()
+ << "\n\nReference =\n" << reference_D1.host_view()
+ << "\nComputed =\n" << tensor_D1.host_view();
+ }
+
+ return passed;
+ }
+};
+
+template
+struct B2bInterleavedFusedGemmRun
+{
+
+ using B2bGemm = B2bGemm_;
+ using ElementAccumulator = typename B2bGemm::ElementAccumulator;
+ using ElementCompute = typename B2bGemm::B2bGemmKernel::Epilogue::OutputOp::ElementCompute;
+
+ /// Initialization
+ cutlass::Distribution::Kind init_A;
+ cutlass::Distribution::Kind init_B;
+ cutlass::Distribution::Kind init_C;
+ uint64_t seed;
+
+ //
+ // Methods
+ //
+
+ B2bInterleavedFusedGemmRun(
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+ uint64_t seed_ = 2080
+ ):
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
+
+ /// Helper to initialize a tensor view
+ template
+ bool initialize_tensor(
+ cutlass::TensorView view,
+ cutlass::Distribution::Kind dist_kind,
+ uint64_t seed) {
+
+ if (dist_kind == cutlass::Distribution::Uniform) {
+
+ cutlass::reference::host::TensorFillRandomUniform(
+ view, seed, 2, -2, 0);
+ }
+ else if (dist_kind == cutlass::Distribution::Identity) {
+
+ cutlass::reference::host::TensorFillIdentity(view);
+ }
+ else if (dist_kind == cutlass::Distribution::Sequential) {
+
+ cutlass::reference::host::BlockFillSequential(
+ view.data(), view.capacity());
+ }
+ else {
+ // TODO: Implement the rest
+ std::cerr << "Not implemented\n";
+ return false;
+ }
+
+ return true;
+ }
+
+
+
+
+ /// Executes one test
+ bool run(
+ cutlass::gemm::GemmCoord problem_size_0,
+ cutlass::gemm::GemmCoord problem_size_1,
+ ElementCompute alpha0 = ElementCompute(1),
+ ElementCompute beta0 = ElementCompute(0),
+ ElementCompute alpha1 = ElementCompute(1),
+ ElementCompute beta1 = ElementCompute(0),
+ bool relu = true) {
+
+ //
+ // Allocate the GEMM workspace
+ //
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementA,
+ typename B2bGemm::LayoutA> tensor_A0(problem_size_0.mk());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementB,
+ typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementB,
+ typename B2bGemm::LayoutB> tensor_B0_reordered(problem_size_0.kn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());
+
+// cutlass::HostTensor<
+// typename B2bGemm::ElementC,
+// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementB,
+ typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementB,
+ typename B2bGemm::LayoutB> tensor_B1_reordered(problem_size_1.kn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn());
+
+ cutlass::HostTensor<
+ typename B2bGemm::ElementC,
+ typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn());
+
+
+ CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
+ CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
+ CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
+ CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
+ CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
+
+  //Reorder B0 and B1
+ cutlass::reorder_column(
+ tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0);
+ cutlass::reorder_column(
+ tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1);
+
+ cutlass::reference::host::TensorFill(
+ tensor_D1.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D0.host_view());
+ cutlass::reference::host::TensorFill(
+ reference_D1.host_view());
+
+ tensor_A0.sync_device();
+ tensor_B0.sync_device();
+ tensor_B0_reordered.sync_device();
+ tensor_C0.sync_device();
+ //tensor_D0.sync_device();
+ tensor_B1.sync_device();
+ tensor_B1_reordered.sync_device();
+ tensor_C1.sync_device();
+ tensor_D1.sync_device();
+ reference_D0.sync_device();
+ reference_D1.sync_device();
+
+ //
+ // Initialize the GEMM operator
+ //
+
+ typename B2bGemm::Arguments arguments{
+ problem_size_0,
+ problem_size_1,
+ tensor_A0.device_ref(),
+ tensor_B0_reordered.device_ref(),
+ tensor_C0.device_ref(),
+ tensor_B1_reordered.device_ref(),
+ tensor_C1.device_ref(),
+ tensor_D1.device_ref(),
+ {alpha0, beta0},
+ {alpha1, beta1},
+ 1, /*threadblock_swizzle_k_tile*/
+ };
+
+ B2bGemm b2b_gemm_op;
+
+ cutlass::Status status = b2b_gemm_op.initialize(arguments);
+
+ CUTLASS_CHECK(status);
+
+ //
+ // Run the GEMM
+ //
+
+ cudaEvent_t start, stop;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+
+ cudaEventRecord(start);
+
+ for(int i = 0; i < 100; i++) {
+ status = b2b_gemm_op();
+
+ CUTLASS_CHECK(status);
+ }
+
+ cudaEventRecord(stop);
+ cudaDeviceSynchronize();
+ float gemmTime;
+ cudaEventElapsedTime(&gemmTime, start, stop);
+ std::cout << "time " << gemmTime / 100.0 << " ms\n";
+
+ //tensor_D0.sync_host();
+ tensor_D1.sync_host();
+
+ //
+ // Verify
+ //
+ cutlass::reference::device::Gemm<
+ typename B2bGemm::ElementA, typename B2bGemm::LayoutA,
+ typename B2bGemm::ElementB, typename B2bGemm::LayoutB,
+ typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute,
+ ElementAccumulator, typename B2bGemm::Operator>
+ reference_gemm_0, reference_gemm_1;
+
+ reference_gemm_0(
+ problem_size_0,
+ alpha0,
+ tensor_A0.device_ref(),
+ tensor_B0.device_ref(),
+ beta0,
+ tensor_C0.device_ref(),
+ reference_D0.device_ref()
+ );
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D0.device_view());
+ }
+
+ reference_gemm_1(
+ problem_size_1,
+ alpha1,
+ reference_D0.device_ref(),
+ tensor_B1.device_ref(),
+ beta1,
+ tensor_C1.device_ref(),
+ reference_D1.device_ref()
+ );
+
+
+ if(relu) {
+ cutlass::reference::device::TensorReLu(reference_D1.device_view());
+ }
+
+ cudaDeviceSynchronize();
+ reference_D0.sync_host();
+ reference_D1.sync_host();
+
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
+ CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
+
+ bool passed = cutlass::reference::host::TensorEquals(
+ reference_D1.host_view(),
+ tensor_D1.host_view());
+
+ CHECK_TRUE(passed);
+ if (!passed) {
+
+ std::stringstream fname;
+
+ fname << "error_B2bGemm_device_interleaved_fused.txt";
+ std::cerr << "Dumping results in " << fname.str() << "\n";
+
+ std::ofstream file(fname.str());
+
+ file
+ << "A0 =\n" << tensor_A0.host_view()
+ << "\nB0 =\n" << tensor_B0.host_view()
+ << "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
+ << "\nC0 =\n" << tensor_C0.host_view()
+// << "\nD0 =\n" << tensor_D0.host_view()
+ << "\nB1 =\n" << tensor_B1.host_view()
+ << "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
+ << "\nC1 =\n" << tensor_C1.host_view()
+ << "\n\nReference =\n" << reference_D1.host_view()
+ << "\nComputed =\n" << tensor_D1.host_view();
+ }
+
+ return passed;
+ }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_fused_two_gemms/device/b2b_gemm.h b/examples/13_fused_two_gemms/device/b2b_gemm.h
new file mode 100644
index 00000000..3f161435
--- /dev/null
+++ b/examples/13_fused_two_gemms/device/b2b_gemm.h
@@ -0,0 +1,439 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+
+#include "kernel/b2b_gemm.h"
+#include "kernel/default_b2b_gemm.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+ /// Element type for A matrix operand
+ typename ElementA_,
+ /// Layout type for A matrix operand
+ typename LayoutA_,
+ /// Element type for B matrix operand
+ typename ElementB_,
+ /// Layout type for B matrix operand
+ typename LayoutB_,
+ /// Element type for C and D matrix operands
+ typename ElementC_,
+ /// Layout type for C and D matrix operands
+ typename LayoutC_,
+ /// Element type for internal accumulation
+ typename ElementAccumulator_ = ElementC_,
+ /// Operator class tag
+ typename OperatorClass_ = arch::OpClassSimt,
+ /// Tag indicating architecture to tune for
+ typename ArchTag_ = arch::Sm70,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape0_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::ThreadblockShape,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape1_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::ThreadblockShape,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape0_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::WarpShape,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape1_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::WarpShape,
+ /// Instruction-level tile size (concept: GemmShape)
+ typename InstructionShape_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::InstructionShape,
+ /// Epilogue output operator
+ typename EpilogueOutputOp0_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::EpilogueOutputOp,
+ /// Epilogue output operator
+ typename EpilogueOutputOp1_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::EpilogueOutputOp,
+ /// Threadblock-level swizzling operator
+ typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+ /// Number of stages used in the pipelined mainloop
+ int Stages =
+ DefaultGemmConfiguration::kStages,
+ /// Access granularity of A matrix in units of elements
+ int AlignmentA =
+ DefaultGemmConfiguration::kAlignmentA,
+ /// Access granularity of B matrix in units of elements
+ int AlignmentB =
+ DefaultGemmConfiguration::kAlignmentB,
+ /// If true, kernel supports split-K with serial reduction
+ bool SplitKSerial = false,
+ /// Operation performed by GEMM
+ typename Operator_ = typename DefaultGemmConfiguration<
+ OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+ ElementAccumulator_>::Operator,
+ /// Whether Beta is zero or not
+ bool IsBetaZero = false>
+class B2bGemm {
+ public:
+
+ using ElementA = ElementA_;
+ using LayoutA = LayoutA_;
+ using TensorRefA = TensorRef;
+ using ElementB = ElementB_;
+ using LayoutB = LayoutB_;
+ using TensorRefB = TensorRef;
+ using ElementC = ElementC_;
+ using LayoutC = LayoutC_;
+ using TensorRefC = TensorRef;
+ using TensorRefD = TensorRef;
+ using ElementAccumulator = ElementAccumulator_;
+ using OperatorClass = OperatorClass_;
+ using ArchTag = ArchTag_;
+ using ThreadblockShape0 = ThreadblockShape0_;
+ using ThreadblockShape1 = ThreadblockShape1_;
+ using WarpShape0 = WarpShape0_;
+ using WarpShape1 = WarpShape1_;
+ using InstructionShape = InstructionShape_;
+ using EpilogueOutputOp0 = EpilogueOutputOp0_;
+ using EpilogueOutputOp1 = EpilogueOutputOp1_;
+ using ThreadblockSwizzle = ThreadblockSwizzle_;
+ using Operator = Operator_;
+ static int const kStages = Stages;
+ static int const kAlignmentA = AlignmentA;
+ static int const kAlignmentB = AlignmentB;
+ static int const kAlignmentC = EpilogueOutputOp1::kCount;
+ static bool const kSplitKSerial = SplitKSerial;
+ static bool const kIsBetaZero = IsBetaZero;
+ static ComplexTransform const kTransformA = ComplexTransform::kNone;
+ static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+ /// Define the kernel
+ using B2bGemmKernel = typename kernel::DefaultB2bGemm<
+ ElementA,
+ LayoutA,
+ kAlignmentA,
+ ElementB,
+ LayoutB,
+ kAlignmentB,
+ ElementC,
+ LayoutC,
+ ElementAccumulator,
+ OperatorClass,
+ ArchTag,
+ ThreadblockShape0,
+ ThreadblockShape1,
+ WarpShape0,
+ WarpShape1,
+ InstructionShape,
+ EpilogueOutputOp0,
+ EpilogueOutputOp1,
+ ThreadblockSwizzle,
+ kStages,
+ kSplitKSerial,
+ Operator,
+ kIsBetaZero
+ >::B2bGemmKernel;
+
+ /// Argument structure
+ struct Arguments {
+
+ //
+ // Data members
+ //
+
+ GemmCoord problem_size_0;
+ GemmCoord problem_size_1;
+ TensorRef ref_A0;
+ TensorRef ref_B0;
+ TensorRef ref_C0;
+ TensorRef ref_B1;
+ TensorRef ref_C1;
+ TensorRef ref_D1;
+ typename EpilogueOutputOp0::Params epilogue0;
+ typename EpilogueOutputOp1::Params epilogue1;
+ int split_k_slices;
+
+ //
+ // Methods
+ //
+
+ /// Default ctor
+ CUTLASS_HOST_DEVICE
+ Arguments(): problem_size_0(0, 0, 0), problem_size_1(0, 0, 0), split_k_slices(1) {
+
+ }
+
+ /// Constructs an Arguments structure
+ CUTLASS_HOST_DEVICE
+ Arguments(
+ GemmCoord problem_size_0_,
+ GemmCoord problem_size_1_,
+ TensorRef ref_A0_,
+ TensorRef ref_B0_,
+ TensorRef ref_C0_,
+ TensorRef ref_B1_,
+ TensorRef ref_C1_,
+ TensorRef ref_D1_,
+ typename EpilogueOutputOp0::Params epilogue0_ =
+ typename EpilogueOutputOp0::Params(),
+ typename EpilogueOutputOp1::Params epilogue1_ =
+ typename EpilogueOutputOp1::Params(),
+ int split_k_slices_ = 1
+ ):
+ problem_size_0(problem_size_0_),
+ problem_size_1(problem_size_1_),
+ ref_A0(ref_A0_),
+ ref_B0(ref_B0_),
+ ref_C0(ref_C0_),
+ ref_B1(ref_B1_),
+ ref_C1(ref_C1_),
+ ref_D1(ref_D1_),
+ epilogue0(epilogue0_),
+ epilogue1(epilogue1_),
+ split_k_slices(split_k_slices_) {
+
+ }
+ };
+
+private:
+
+ /// Kernel parameters object
+ typename B2bGemmKernel::Params params_;
+
+public:
+
+ /// Constructs the GEMM.
+ B2bGemm() { }
+
+ /// Determines whether the GEMM can execute the given problem.
+ static Status can_implement(Arguments const &args) {
+
+ if (!kSplitKSerial && args.split_k_slices > 1) {
+ return Status::kErrorInvalidProblem;
+ }
+
+ Status status = B2bGemmKernel::can_implement(
+ args.problem_size_0,
+ args.problem_size_1,
+ args.ref_A0.non_const_ref(),
+ args.ref_B0.non_const_ref(),
+ args.ref_C0.non_const_ref(),
+ args.ref_B1.non_const_ref(),
+ args.ref_C1.non_const_ref(),
+ args.ref_D1
+ );
+
+ if (status != Status::kSuccess) {
+ return status;
+ }
+
+ return Status::kSuccess;
+ }
+
+ /// Gets the workspace size
+ static size_t get_workspace_size(Arguments const &args) {
+
+ size_t bytes = 0;
+
+ // Determine grid shape
+ ThreadblockSwizzle threadblock_swizzle;
+
+ cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+ args.problem_size_0,
+ {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+ args.split_k_slices);
+
+ if (kSplitKSerial && args.split_k_slices > 1) {
+
+
+ bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+ }
+
+ return bytes;
+ }
+
+ /// Initializes GEMM state from arguments.
+ Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+ // Determine grid shape
+ ThreadblockSwizzle threadblock_swizzle;
+
+ cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+ args.problem_size_0,
+ {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+ args.split_k_slices);
+// cutlass::gemm::GemmCoord grid_shape_1 = threadblock_swizzle.get_tiled_shape(
+// args.problem_size_1,
+// {ThreadblockShape1::kM, ThreadblockShape1::kN, ThreadblockShape1::kK},
+// args.split_k_slices);
+
+ if (kSplitKSerial) {
+ if (args.split_k_slices > 1) {
+ if (!workspace) {
+ return Status::kErrorWorkspaceNull;
+ }
+
+ size_t bytes = get_workspace_size(args);
+
+ cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+ if (result != cudaSuccess) {
+ return Status::kErrorInternal;
+ }
+ }
+ }
+ else {
+
+ if (args.split_k_slices > 1) {
+ return Status::kErrorInvalidProblem;
+ }
+ }
+
+ // Initialize the Params structure
+ params_ = typename B2bGemmKernel::Params{
+ args.problem_size_0,
+ args.problem_size_1,
+ grid_shape,
+ args.ref_A0.non_const_ref(),
+ args.ref_B0.non_const_ref(),
+ args.ref_C0.non_const_ref(),
+ args.ref_B1.non_const_ref(),
+ args.ref_C1.non_const_ref(),
+ args.ref_D1,
+ args.epilogue0,
+ args.epilogue1,
+ static_cast(workspace),
+ };
+
+ return Status::kSuccess;
+ }
+
+ /// Lightweight update given a subset of arguments
+ Status update(Arguments const &args, void *workspace = nullptr) {
+
+ if (kSplitKSerial && args.split_k_slices > 1) {
+ if (!workspace) {
+ return Status::kErrorWorkspaceNull;
+ }
+ }
+
+ params_.ref_A0.reset(args.ref_A.non_const_ref().data());
+ params_.ref_B0.reset(args.ref_B.non_const_ref().data());
+ params_.ref_C0.reset(args.ref_C.non_const_ref().data());
+ params_.ref_B1.reset(args.ref_B.non_const_ref().data());
+ params_.ref_C1.reset(args.ref_C.non_const_ref().data());
+ params_.ref_D1.reset(args.ref_D.data());
+ params_.output_op_0 = args.epilogue0;
+ params_.output_op_1 = args.epilogue1;
+ params_.semaphore = static_cast(workspace);
+
+ return Status::kSuccess;
+ }
+
+ /// Runs the kernel using initialized state.
+ Status run(cudaStream_t stream = nullptr) {
+
+ ThreadblockSwizzle threadblock_swizzle;
+
+ dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+ dim3 block(B2bGemmKernel::kThreadCount, 1, 1);
+
+ cudaError_t result;
+
+ int smem_size = int(sizeof(typename B2bGemmKernel::SharedStorage));
+ if (smem_size >= (48 << 10)) {
+ result = cudaFuncSetAttribute(Kernel,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ smem_size);
+
+ if (result != cudaSuccess) {
+ return Status::kErrorInternal;
+ }
+
+ result = cudaFuncSetAttribute(
+ Kernel,
+ cudaFuncAttributePreferredSharedMemoryCarveout, 100);
+
+ if (result != cudaSuccess) {
+ return Status::kErrorInternal;
+ }
+ }
+
+ cutlass::Kernel<<>>(params_);
+
+ result = cudaGetLastError();
+
+ return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+ }
+
+ /// Runs the kernel using initialized state.
+ Status operator()(cudaStream_t stream = nullptr) {
+ return run(stream);
+ }
+
+ /// Runs the kernel using initialized state.
+ Status operator()(
+ Arguments const &args,
+ void *workspace = nullptr,
+ cudaStream_t stream = nullptr) {
+
+ Status status = initialize(args, workspace);
+
+ if (status == Status::kSuccess) {
+ status = run(stream);
+ }
+
+ return status;
+ }
+};
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_fused_two_gemms/fused_gemm.cu b/examples/13_fused_two_gemms/fused_gemm.cu
new file mode 100644
index 00000000..8f5d4f2c
--- /dev/null
+++ b/examples/13_fused_two_gemms/fused_gemm.cu
@@ -0,0 +1,74 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+*/
+
+#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h"
+#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
+
+int run() {
+
+ cudaDeviceProp props;
+
+ cudaError_t error = cudaGetDeviceProperties(&props, 0);
+ if (error != cudaSuccess) {
+ std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+ return -1;
+ }
+
+ if (!(props.major * 10 + props.minor >= 75)) {
+ std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
+ << std::endl;
+
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+ run_nonfused_gemm_f16();
+ run_fused_gemm_f16();
+ run_nonfused_gemm_s8();
+ run_fused_gemm_s8();
+#endif
+
+ return 0;
+}
+
+int main() {
+ // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
+ //
+  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
+ if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+ std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+
+ // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+ return 0;
+ }
+ else {
+ return run();
+ }
+}
+
diff --git a/examples/13_fused_two_gemms/kernel/b2b_gemm.h b/examples/13_fused_two_gemms/kernel/b2b_gemm.h
new file mode 100644
index 00000000..d106fa46
--- /dev/null
+++ b/examples/13_fused_two_gemms/kernel/b2b_gemm.h
@@ -0,0 +1,407 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+ typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate
+ typename Epilogue_, ///! Epilogue
+ typename ThreadblockSwizzle_, ///! Threadblock swizzling function
+ bool SplitKSerial ///! If true, code supporting split-K via serial reduction is enabled.
+>
+struct B2bGemm {
+
+ using B2bMma = B2bMma_;
+ using Epilogue = Epilogue_;
+ using OutputOp0 = typename B2bMma::OutputOp;
+ using OutputOp1 = typename Epilogue::OutputOp;
+ using ThreadblockSwizzle = ThreadblockSwizzle_;
+ static bool const kSplitKSerial = SplitKSerial;
+
+ /// Warp count (concept: GemmShape)
+ using WarpCount0 = typename B2bMma::WarpCount0;
+ static int const kThreadCount = 32 * WarpCount0::kCount;
+
+ /// Parameters structure
+ struct Params {
+ cutlass::gemm::GemmCoord problem_size_0;
+ cutlass::gemm::GemmCoord problem_size_1;
+ cutlass::gemm::GemmCoord grid_tiled_shape;
+ typename B2bMma::IteratorA0::Params params_A0;
+ typename B2bMma::IteratorA0::TensorRef ref_A0;
+ typename B2bMma::IteratorB0::Params params_B0;
+ typename B2bMma::IteratorB0::TensorRef ref_B0;
+ typename Epilogue::OutputTileIterator::Params params_C0;
+ typename Epilogue::OutputTileIterator::TensorRef ref_C0;
+ typename B2bMma::IteratorB1::Params params_B1;
+ typename B2bMma::IteratorB1::TensorRef ref_B1;
+ typename Epilogue::OutputTileIterator::Params params_C1;
+ typename Epilogue::OutputTileIterator::TensorRef ref_C1;
+ typename Epilogue::OutputTileIterator::Params params_D1;
+ typename Epilogue::OutputTileIterator::TensorRef ref_D1;
+ typename OutputOp0::Params output_op_0;
+ typename OutputOp1::Params output_op_1;
+ int *semaphore;
+ int gemm_k_iterations_0;
+ int gemm_k_size_0;
+ int gemm_k_iterations_1;
+ int gemm_k_size_1;
+
+ //
+ // Methods
+ //
+
+ CUTLASS_HOST_DEVICE
+ Params(): semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
+ gemm_k_iterations_1(0), gemm_k_size_1(0) { }
+
+ CUTLASS_HOST_DEVICE
+ Params(
+ cutlass::gemm::GemmCoord const & problem_size_0,
+ cutlass::gemm::GemmCoord const & problem_size_1,
+ cutlass::gemm::GemmCoord const & grid_tiled_shape,
+ typename B2bMma::IteratorA0::TensorRef ref_A0,
+ typename B2bMma::IteratorB0::TensorRef ref_B0,
+ typename Epilogue::OutputTileIterator::TensorRef ref_C0,
+ typename B2bMma::IteratorB1::TensorRef ref_B1,
+ typename Epilogue::OutputTileIterator::TensorRef ref_C1,
+ typename Epilogue::OutputTileIterator::TensorRef ref_D1,
+ typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(),
+ typename OutputOp1::Params output_op_1 = typename OutputOp1::Params(),
+ int *workspace = nullptr
+ ):
+ problem_size_0(problem_size_0),
+ problem_size_1(problem_size_1),
+ grid_tiled_shape(grid_tiled_shape),
+ params_A0(ref_A0.layout()),
+ ref_A0(ref_A0),
+ params_B0(ref_B0.layout()),
+ ref_B0(ref_B0),
+ params_C0(ref_C0.layout()),
+ ref_C0(ref_C0),
+ params_B1(ref_B1.layout()),
+ ref_B1(ref_B1),
+ params_C1(ref_C1.layout()),
+ ref_C1(ref_C1),
+ params_D1(ref_D1.layout()),
+ ref_D1(ref_D1),
+ output_op_0(output_op_0),
+ output_op_1(output_op_1) {
+
+ int total_gemm_k_iterations_0 = (problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK;
+ int gemm_k_iterations_0 = (total_gemm_k_iterations_0 + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
+ gemm_k_size_0 = gemm_k_iterations_0 * B2bMma::Shape0::kK;
+ int total_gemm_k_iterations_1 = (problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK;
+ int gemm_k_iterations_1 = (total_gemm_k_iterations_1 + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
+ gemm_k_size_1 = gemm_k_iterations_1 * B2bMma::Shape1::kK;
+
+ semaphore = workspace;
+ }
+ };
+
+ /// Shared memory storage structure
+ union SharedStorage {
+ typename B2bMma::B2bMmaSharedStorage main_loop;
+ typename Epilogue::SharedStorage epilogue;
+ };
+
+ //
+ // Methods
+ //
+
+ CUTLASS_HOST_DEVICE
+ B2bGemm() { }
+
+ /// Determines whether kernel satisfies alignment
+ static Status can_implement(
+ cutlass::gemm::GemmCoord const & problem_size_0,
+ cutlass::gemm::GemmCoord const & problem_size_1,
+ typename B2bMma::IteratorA0::TensorRef ref_A0,
+ typename B2bMma::IteratorB0::TensorRef ref_B0,
+ typename Epilogue::OutputTileIterator::TensorRef ref_C0,
+ typename B2bMma::IteratorB1::TensorRef ref_B1,
+ typename Epilogue::OutputTileIterator::TensorRef ref_C1,
+ typename Epilogue::OutputTileIterator::TensorRef ref_D1) {
+
+ static int const kAlignmentA = B2bMma::IteratorA0::AccessType::kElements;
+ static int const kAlignmentB = B2bMma::IteratorB0::AccessType::kElements;
+ static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+ if (!TensorRef_aligned(ref_A0, kAlignmentA)) {
+ return Status::kErrorMisalignedOperand;
+ }
+
+ if (!TensorRef_aligned(ref_B0, kAlignmentB)) {
+ return Status::kErrorMisalignedOperand;
+ }
+
+ if (!TensorRef_aligned(ref_C0, kAlignmentC)) {
+ return Status::kErrorMisalignedOperand;
+ }
+
+ if (!TensorRef_aligned(ref_B1, kAlignmentB)) {
+ return Status::kErrorMisalignedOperand;
+ }
+
+ if (!TensorRef_aligned(ref_C1, kAlignmentC)) {
+ return Status::kErrorMisalignedOperand;
+ }
+
+ if (!TensorRef_aligned(ref_D1, kAlignmentC)) {
+ return Status::kErrorMisalignedOperand;
+ }
+
+ if ((problem_size_0.m() % kAlignmentA) || (problem_size_0.k() % kAlignmentA) ||
+ (problem_size_0.n() % kAlignmentB) || (problem_size_0.k() % kAlignmentB) ||
+ (problem_size_0.m() % kAlignmentC) || (problem_size_0.n() % kAlignmentC) ||
+ (problem_size_1.m() % kAlignmentA) || (problem_size_1.k() % kAlignmentA) ||
+ (problem_size_1.n() % kAlignmentB) || (problem_size_1.k() % kAlignmentB) ||
+ (problem_size_1.m() % kAlignmentC) || (problem_size_1.n() % kAlignmentC)) {
+
+ return Status::kErrorMisalignedOperand;
+ }
+
+ return Status::kSuccess;
+ }
+
+ /// Executes one GEMM
+ CUTLASS_DEVICE
+ void operator()(Params const ¶ms, SharedStorage &shared_storage) {
+
+ // Compute threadblock location
+ ThreadblockSwizzle threadblock_swizzle;
+
+ cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset();
+
+ // Early exit if CTA is out of range
+ if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+ params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+ return;
+ }
+
+ // Compute initial location in logical coordinates
+ cutlass::MatrixCoord tb_offset_A0{
+ threadblock_tile_offset.m() * B2bMma::Shape0::kM,
+ threadblock_tile_offset.k() * params.gemm_k_size_0,
+ };
+
+ cutlass::MatrixCoord tb_offset_B0{
+ threadblock_tile_offset.k() * params.gemm_k_size_0,
+ threadblock_tile_offset.n() * B2bMma::Shape0::kN
+ };
+
+ cutlass::MatrixCoord tb_offset_B1{
+ threadblock_tile_offset.k() * params.gemm_k_size_1,
+ threadblock_tile_offset.n() * B2bMma::Shape1::kN
+ };
+
+ // Problem size is a function of threadblock index in the K dimension
+ int problem_size_k_0 = min(
+ params.problem_size_0.k(),
+ (threadblock_tile_offset.k() + 1) * params.gemm_k_size_0);
+
+ // Compute threadblock-scoped matrix multiply-add
+ int gemm_k_iterations_0 = (problem_size_k_0 - tb_offset_A0.column() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK;
+
+ // Problem size is a function of threadblock index in the K dimension
+ int problem_size_k_1 = min(
+ params.problem_size_1.k(),
+ (threadblock_tile_offset.k() + 1) * params.gemm_k_size_1);
+
+ // Compute threadblock-scoped matrix multiply-add
+// int gemm_k_iterations_1 = (problem_size_k_1 - tb_offset_B1.row() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK;
+
+
+ // Compute position within threadblock
+ int thread_idx = threadIdx.x;
+
+ // Construct iterators to A and B operands
+ typename B2bMma::IteratorA0 iterator_A0(
+ params.params_A0,
+ params.ref_A0.data(),
+ {params.problem_size_0.m(), problem_size_k_0},
+ thread_idx,
+ tb_offset_A0);
+
+ typename B2bMma::IteratorB0 iterator_B0(
+ params.params_B0,
+ params.ref_B0.data(),
+ {problem_size_k_0, params.problem_size_0.n()},
+ thread_idx,
+ tb_offset_B0);
+
+ typename B2bMma::IteratorB1 iterator_B1(
+ params.params_B1,
+ params.ref_B1.data(),
+ {problem_size_k_1, params.problem_size_1.n()},
+ thread_idx,
+ tb_offset_B1);
+
+
+ // Broadcast the warp_id computed by lane 0 to ensure dependent code
+ // is compiled as warp-uniform.
+ int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+ int lane_idx = threadIdx.x % 32;
+
+ //
+ // Main loop
+ //
+
+ OutputOp0 output_op_0(params.output_op_0);
+
+ // Construct thread-scoped matrix multiply
+ B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+ typename B2bMma::FragmentC0 src_accum;
+ typename B2bMma::FragmentC1 accumulators;
+
+ src_accum.clear();
+ accumulators.clear();
+
+ if (!kSplitKSerial || gemm_k_iterations_0 > 0) {
+ // Compute threadblock-scoped matrix multiply-add
+ b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1, src_accum, output_op_0);
+ }
+
+ //
+ // Epilogue
+ //
+
+ OutputOp1 output_op_1(params.output_op_1);
+
+ //
+ // Masked tile iterators constructed from members
+ //
+
+ threadblock_tile_offset = threadblock_swizzle.get_tile_offset();
+
+ //assume identity swizzle
+ MatrixCoord threadblock_offset(
+ threadblock_tile_offset.m() * B2bMma::Shape1::kM,
+ threadblock_tile_offset.n() * B2bMma::Shape1::kN
+ );
+
+ int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+ // Construct the semaphore.
+ Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+ // If performing a reduction via split-K, fetch the initial synchronization
+ if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+ // Fetch the synchronization lock initially but do not block.
+ semaphore.fetch();
+
+ // Indicate which position in a serial reduction the output operator is currently updating
+ output_op_1.set_k_partition(threadblock_tile_offset.k());
+ }
+
+ // Tile iterator loading from source tensor.
+ typename Epilogue::OutputTileIterator iterator_C1(
+ params.params_C1,
+ params.ref_C1.data(),
+ params.problem_size_1.mn(),
+ thread_idx,
+ threadblock_offset
+ );
+
+ // Tile iterator writing to destination tensor.
+ typename Epilogue::OutputTileIterator iterator_D1(
+ params.params_D1,
+ params.ref_D1.data(),
+ params.problem_size_1.mn(),
+ thread_idx,
+ threadblock_offset
+ );
+
+ Epilogue epilogue(
+ shared_storage.epilogue,
+ thread_idx,
+ warp_idx,
+ lane_idx);
+
+ // Wait on the semaphore - this latency may have been covered by iterator construction
+ if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+ // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+ if (threadblock_tile_offset.k()) {
+ iterator_C1 = iterator_D1;
+ }
+
+ semaphore.wait(threadblock_tile_offset.k());
+
+ __threadfence();
+ }
+
+ // Execute the epilogue operator to update the destination tensor.
+ epilogue(output_op_1, iterator_D1, accumulators, iterator_C1);
+
+ //
+ // Release the semaphore
+ //
+
+ if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+ int lock = 0;
+ if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+ // The final threadblock resets the semaphore for subsequent grids.
+ lock = 0;
+ }
+ else {
+ // Otherwise, the semaphore is incremented
+ lock = threadblock_tile_offset.k() + 1;
+ }
+
+ __threadfence();
+ semaphore.release(lock);
+ }
+ }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
diff --git a/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h b/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h
new file mode 100644
index 00000000..45b2d545
--- /dev/null
+++ b/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h
@@ -0,0 +1,296 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ *modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its
+ *contributors may be used to endorse or promote products derived from this
+ *software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
+ *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief
+ Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+ the appropriate threadblock-scoped epilogue.
+
+ Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+ accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+ specializations here choose 'device::GemmTransposed' to implement this functionality.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#include "kernel/b2b_gemm.h"
+#include "threadblock/default_b2b_mma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+ /// Element type for A matrix operand
+ typename ElementA_,
+ /// Layout type for A matrix operand
+ typename LayoutA_,
+ /// Access granularity of A matrix in units of elements
+ int kAlignmentA,
+ /// Element type for B matrix operand
+ typename ElementB_,
+ /// Layout type for B matrix operand
+ typename LayoutB_,
+ /// Access granularity of B matrix in units of elements
+ int kAlignmentB,
+ /// Element type for C and D matrix operands
+ typename ElementC_,
+ /// Layout type for C and D matrix operands
+ typename LayoutC_,
+ /// Element type for internal accumulation
+ typename ElementAccumulator,
+ /// Operator class tag
+ typename OperatorClass,
+ /// Tag indicating architecture to tune for
+ typename ArchTag,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape0,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape0,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename InstructionShape,
+ /// Epilogue output operator
+ typename EpilogueOutputOp0,
+ /// Epilogue output operator
+ typename EpilogueOutputOp1,
+ /// Threadblock-level swizzling operator
+ typename ThreadblockSwizzle,
+ /// Number of stages used in the pipelined mainloop
+ int Stages,
+ /// If true, kernel is configured to support serial reduction in the epilogue
+ bool SplitKSerial,
+ /// Operation performed by GEMM
+ typename Operator,
+ /// Beta is zero or not
+ bool IsBetaZero = false
+>
+struct DefaultB2bGemm;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Turing Architecture
+template <
+ /// Element type for A matrix operand
+ typename ElementA,
+ /// Layout type for A matrix operand
+ typename LayoutA,
+ /// Access granularity of A matrix in units of elements
+ int kAlignmentA,
+ /// Element type for B matrix operand
+ typename ElementB,
+ /// Layout type for B matrix operand
+ typename LayoutB,
+ /// Access granularity of B matrix in units of elements
+ int kAlignmentB,
+ /// Element type for C and D matrix operands
+ typename ElementC,
+ /// Element type for internal accumulation
+ typename ElementAccumulator,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape0,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape0,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename InstructionShape,
+ /// Epilogue output operator
+ typename EpilogueOutputOp0,
+ /// Epilogue output operator
+ typename EpilogueOutputOp1,
+ /// Threadblock-level swizzling operator
+ typename ThreadblockSwizzle,
+ /// If true, kernel is configured to support serial reduction in the epilogue
+ bool SplitKSerial,
+ /// Operation performed by GEMM
+ typename Operator
+>
+struct DefaultB2bGemm<
+ ElementA, LayoutA, kAlignmentA,
+ ElementB, LayoutB, kAlignmentB,
+ ElementC, layout::RowMajor,
+ ElementAccumulator,
+ arch::OpClassTensorOp,
+ arch::Sm75,
+ ThreadblockShape0,
+ ThreadblockShape1,
+ WarpShape0,
+ WarpShape1,
+ InstructionShape,
+ EpilogueOutputOp0,
+ EpilogueOutputOp1,
+ ThreadblockSwizzle,
+ 2,
+ SplitKSerial,
+ Operator
+> {
+
+ /// Define the threadblock-scoped matrix multiply-accumulate
+ using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+ ElementA,
+ LayoutA,
+ kAlignmentA,
+ ElementB,
+ LayoutB,
+ kAlignmentB,
+ ElementAccumulator,
+ layout::RowMajor,
+ arch::OpClassTensorOp,
+ arch::Sm75,
+ ThreadblockShape0,
+ ThreadblockShape1,
+ WarpShape0,
+ WarpShape1,
+ InstructionShape,
+ 2,
+ Operator,
+ EpilogueOutputOp0
+ >::ThreadblockB2bMma;
+
+ static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+ /// Define the epilogue
+ using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+ ThreadblockShape1,
+ typename B2bMma::Operator1,
+ kPartitionsK1,
+ EpilogueOutputOp1,
+ EpilogueOutputOp1::kCount
+ >::Epilogue;
+
+ /// Define the kernel-level GEMM operator.
+ using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+
+/// Partial specialization for Turing IMMA Interleaved layout
+template <
+ /// Element type for A matrix operand
+ typename ElementA,
+ /// Access granularity of A matrix in units of elements
+ int kAlignmentA,
+ /// Element type for B matrix operand
+ typename ElementB,
+ /// Access granularity of B matrix in units of elements
+ int kAlignmentB,
+ /// Element type for C and D matrix operands
+ typename ElementC,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape0,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape0,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename InstructionShape,
+ /// Epilogue output operator
+ typename EpilogueOutputOp0,
+ /// Epilogue output operator
+ typename EpilogueOutputOp1,
+ /// Threadblock-level swizzling operator
+ typename ThreadblockSwizzle,
+ /// Number of Interleaved k
+ int InterleavedK,
+ /// If true, kernel is configured to support serial reduction in the
+ /// epilogue
+ bool SplitKSerial,
+ /// Operation performed by GEMM
+ typename Operator,
+ /// Is Beta zero or not
+ bool IsBetaZero>
+struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+ kAlignmentA, ElementB,
+ layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
+ ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
+ int32_t, arch::OpClassTensorOp, arch::Sm75,
+ ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+ InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
+ ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> {
+ using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+ using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+ using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
+
+ using ElementAccumulator = int32_t;
+
+ /// Define the threadblock-scoped matrix multiply-accumulate
+ using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+ ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC,
+ arch::OpClassTensorOp, arch::Sm75, ThreadblockShape0, ThreadblockShape1,
+ WarpShape0, WarpShape1, InstructionShape, 2, Operator, EpilogueOutputOp0, true>::ThreadblockB2bMma;
+
+ static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+ /// Define the epilogue for the 2nd Gemm
+ using Epilogue = typename cutlass::epilogue::threadblock::
+ DefaultInterleavedEpilogueTensorOp<
+ ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
+ 64 / sizeof_bits<ElementC>::value, InterleavedK,
+ IsBetaZero>::Epilogue;
+
+ /// Define the kernel-level GEMM operator.
+ using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h b/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h
new file mode 100644
index 00000000..01cca8b7
--- /dev/null
+++ b/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h
@@ -0,0 +1,230 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+ /// Size of the Gemm problem - concept: gemm::GemmShape<>
+ typename Shape0_,
+ /// Size of the Gemm problem - concept: gemm::GemmShape<>
+ typename Shape1_,
+ /// Policy describing tuning details (concept: MmaPolicy)
+ typename Policy0_,
+ /// Policy describing tuning details (concept: MmaPolicy)
+ typename Policy1_,
+ /// Number of stages,
+ int Stages,
+ /// Used for partial specialization
+ typename Enable = bool>
+class B2bMmaBase {
+ public:
+ ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+ using Shape0 = Shape0_;
+ using Shape1 = Shape1_;
+
+ ///< Policy describing tuning details
+ using Policy0 = Policy0_;
+ using Policy1 = Policy1_;
+
+ //
+ // Dependent types
+ //
+
+ /// Warp-level Mma
+ using Operator0 = typename Policy0::Operator;
+ using Operator1 = typename Policy1::Operator;
+
+ /// Shape describing the overall GEMM computed from shared memory
+ /// by each warp.
+ using WarpGemm0 = typename Policy0::Operator::Shape;
+ using WarpGemm1 = typename Policy1::Operator::Shape;
+
+ /// Shape describing the number of warps filling the CTA
+ using WarpCount0 = GemmShape<Shape0::kM / WarpGemm0::kM, Shape0::kN / WarpGemm0::kN, Shape0::kK / WarpGemm0::kK>;
+ using WarpCount1 = GemmShape<Shape1::kM / WarpGemm1::kM, Shape1::kN / WarpGemm1::kN, Shape1::kK / WarpGemm1::kK>;
+
+ /// Number of warp-level GEMM operations
+ static int const kWarpGemmIterations0 =
+ (WarpGemm0::kK / Operator0::Policy::MmaShape::kK);
+ static int const kWarpGemmIterations1 =
+ (WarpGemm1::kK / Operator1::Policy::MmaShape::kK);
+
+ /// Number of stages
+ static int const kStages = Stages;
+
+ //
+ // Nested structs
+ //
+
+ /// Shared storage object needed by threadblock-scoped GEMM
+ template<
+ typename Shape_,
+ typename Policy_
+ >
+ class SharedStorage {
+ public:
+ //
+ // Type definitions
+ //
+ using Shape = Shape_;
+ using Policy = Policy_;
+ using Operator = typename Policy::Operator;
+
+ /// Tensor reference to the A operand
+ using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+ /// Tensor reference to the B operand
+ using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+
+ /// Shape of the A matrix operand in shared memory
+ using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow, Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
+
+ /// Shape of the B matrix operand in shared memory
+ using ShapeB =
+ MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow, Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+ public:
+ //
+ // Data members
+ //
+
+ /// Buffer for A operand
+ AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+ /// Buffer for B operand
+ AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+ public:
+
+ //
+ // Methods
+ //
+
+ /// Returns a layout object for the A matrix
+ CUTLASS_DEVICE
+ static typename Operator::LayoutA LayoutA() {
+ return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+ }
+
+ /// Returns a layout object for the B matrix
+ CUTLASS_HOST_DEVICE
+ static typename Operator::LayoutB LayoutB() {
+ return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+ }
+
+ /// Returns a TensorRef to the A operand
+ CUTLASS_HOST_DEVICE
+ TensorRefA operand_A_ref() {
+ return TensorRefA{operand_A.data(), LayoutA()};
+ }
+
+ /// Returns a TensorRef to the B operand
+ CUTLASS_HOST_DEVICE
+ TensorRefB operand_B_ref() {
+ return TensorRefB{operand_B.data(), LayoutB()};
+ }
+ };
+
+ using SharedStorage0 = SharedStorage<Shape0, Policy0>;
+ using SharedStorage1 = SharedStorage<Shape1, Policy1>;
+ union B2bMmaSharedStorage {
+ SharedStorage0 sharedStorage0;
+ SharedStorage1 sharedStorage1;
+ };
+
+
+ protected:
+
+ //
+ // Data members
+ //
+
+ /// Iterator to load a warp-scoped tile of A0 operand from shared memory
+ typename Operator0::IteratorA warp_tile_iterator_A0_;
+
+ /// Iterator to load a warp-scoped tile of B0 operand from shared memory
+ typename Operator0::IteratorB warp_tile_iterator_B0_;
+
+ /// Iterator to load a warp-scoped tile of B0 operand from shared memory
+ typename Operator1::IteratorB warp_tile_iterator_B1_;
+
+public:
+
+ /// Construct from tensor references
+ CUTLASS_DEVICE
+ B2bMmaBase(
+ ///< Shared storage needed for internal use by threadblock-scoped GEMM
+ B2bMmaSharedStorage &shared_storage,
+ ///< ID within the threadblock
+ int thread_idx,
+ ///< ID of warp
+ int warp_idx,
+ ///< ID of each thread within a warp
+ int lane_idx
+ ):
+ warp_tile_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), lane_idx),
+ warp_tile_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), lane_idx),
+ warp_tile_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), lane_idx) {
+
+ }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h b/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h
new file mode 100644
index 00000000..ca89cf0b
--- /dev/null
+++ b/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h
@@ -0,0 +1,509 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Template for a double-buffered threadblock-scoped Back-to-back fused GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+template
+struct chk_val {
+ static_assert(a==0, "check value");
+};
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+ /// Size of the Gemm problem - concept: gemm::GemmShape<>
+ typename Shape0_,
+ /// Iterates over tiles of A operand in global memory
+ // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+ typename IteratorA0_,
+ /// Iterates over tiles of A operand in shared memory
+ /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+ typename SmemIteratorA0_,
+ /// Iterates over tiles of B operand in global memory
+ // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+ typename IteratorB0_,
+ /// Iterates over tiles of B operand in shared memory
+ /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+ typename SmemIteratorB0_,
+ /// Size of the Gemm problem - concept: gemm::GemmShape<>
+ typename Shape1_,
+ /// Iterates over the intermediate accumulator tile
+ // (concept::MmaTensorOpFragmentIterator)
+ typename FragmentIteratorA1_,
+ /// Iterates over tiles of B operand in global memory
+ // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+ typename IteratorB1_,
+ /// Iterates over tiles of B operand in shared memory
+ /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+ typename SmemIteratorB1_,
+ /// Data type of accumulator matrix
+ typename ElementC_,
+ /// Data type of accumulator matrix
+ typename LayoutC_,
+ /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...)
+ typename OutputOp_,
+ /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+ typename Policy0_,
+ /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+ typename Policy1_,
+ /// Transformation applied to A0 operand
+ typename TransformA0_ = NumericArrayConverter<
+ typename SmemIteratorA0_::Element,
+ typename IteratorA0_::Element,
+ IteratorA0_::Fragment::kElements>,
+ ///
+ /// Transformation applied to B0 operand
+ typename TransformB0_ = NumericArrayConverter<
+ typename SmemIteratorB0_::Element,
+ typename IteratorB0_::Element,
+ IteratorB0_::Fragment::kElements>,
+ ///
+ /// Transformation applied to B1 operand
+ typename TransformB1_ = NumericArrayConverter<
+ typename SmemIteratorB1_::Element,
+ typename IteratorB1_::Element,
+ IteratorB1_::Fragment::kElements>,
+ /// Used for partial specialization
+ typename Enable = bool
+>
+class B2bMmaPipelined : public B2bMmaBase {
+public:
+
+ ///< Base class
+ using Base = B2bMmaBase;
+
+ using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+ using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory
+ using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory
+ using Policy0 = Policy0_; ///< Policy describing tuning details
+
+ using SmemIteratorA0 = SmemIteratorA0_;
+ using SmemIteratorB0 = SmemIteratorB0_;
+
+ using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+ using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over intermediate accumulator tile
+ using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory
+ using Policy1 = Policy1_; ///< Policy describing tuning details
+
+ using SmemIteratorB1 = SmemIteratorB1_;
+
+
+ using ElementC = ElementC_; ///< Data type of accumulator matrix
+ using LayoutC = LayoutC_; ///< Layout of accumulator matrix
+
+ using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm
+
+ using TransformA0 = TransformA0_;
+ using TransformB0 = TransformB0_;
+ using TransformB1 = TransformB1_;
+
+ //
+ // Dependent types
+ //
+
+ /// Fragment of operand A loaded from global memory
+ using FragmentA0 = typename IteratorA0::Fragment;
+
+ /// Fragment of operand B loaded from global memory
+ using FragmentB0 = typename IteratorB0::Fragment;
+
+ /// Fragment of accumulator tile
+ using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+ /// Warp-level Mma
+ using Operator0 = typename Policy0::Operator;
+
+ /// Fragment of operand B loaded from global memory
+ using FragmentB1 = typename IteratorB1::Fragment;
+
+ /// Fragment of accumulator tile
+ using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+ /// Warp-level Mma
+ using Operator1 = typename Policy1::Operator;
+
+ /// Obtain the arch tag from the warp-level operator
+ using ArchTag = typename Policy0::Operator::ArchTag;
+
+ /// Complex transform on A0 operand
+ static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+ /// Complex transform on B0 operand
+ static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+
+ /// Complex transform on B1 operand
+ static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+ // statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
+ static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+private:
+
+ using WarpFragmentA0 = typename Operator0::FragmentA;
+ using WarpFragmentB0 = typename Operator0::FragmentB;
+ /// Warp Fragment of operand A1 loaded from accumulator tile
+ using WarpFragmentA1 = typename FragmentIteratorA1::Fragment;
+ using WarpFragmentB1 = typename Operator1::FragmentB;
+
+protected:
+
+ /// Iterator to write threadblock-scoped tile of A operand to shared memory
+ SmemIteratorA0 smem_iterator_A_;
+
+ /// Iterator to write threadblock-scoped tile of B0 operand to shared memory
+ SmemIteratorB0 smem_iterator_B0_;
+
+ /// Iterator to write threadblock-scoped tile of B1 operand to shared memory
+ SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+ /// Construct from tensor references
+ CUTLASS_DEVICE
+ B2bMmaPipelined(
+ typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
+ int thread_idx, ///< ID within the threadblock
+ int warp_idx, ///< ID of warp
+ int lane_idx ///< ID of each thread within a warp
+ ):
+ Base(shared_storage, thread_idx, warp_idx, lane_idx),
+ smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),
+ smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx),
+ smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) {
+
+
+ // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
+ // _m: the warp's position within the threadblock along the M dimension
+ // _n: the warp's position within the threadblock along the N dimension
+ // _k: the warp's position within the threadblock along the K dimension
+
+ //These should stay the same across different GEMM layers
+ int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+ int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+ int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
+ int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
+
+ //These may change across different GEMM layers
+ int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k;
+ int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k;
+
+ // Add per-warp offsets in units of warp-level tiles
+ this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k_0});
+ this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n});
+ this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n});
+ }
+
+ /// Perform a threadblock-scoped matrix multiply-accumulate
+ CUTLASS_DEVICE
+ void operator()(
+ int gemm_k_iterations_0, ///< number of iterations of the mainloop
+ FragmentC1 &accum, ///< destination accumulator tile
+ IteratorA0 iterator_A, ///< iterator over A operand in global memory
+ IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory
+ IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory
+ FragmentC0 const &src_accum, ///< source accumulator tile
+ OutputOp output_op_0, ///< epilogue operation after 1st Gemm
+ TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment
+ TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment
+ TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment
+
+ //
+ // Prologue
+ //
+
+ // Perform accumulation in the 'd' output operand
+ FragmentC0 accum0 = src_accum;
+
+ FragmentA0 tb_frag_A;
+ FragmentB0 tb_frag_B0;
+
+ tb_frag_A.clear();
+ tb_frag_B0.clear();
+
+ // The last kblock is loaded in the prolog
+ iterator_A.load(tb_frag_A);
+ iterator_B0.load(tb_frag_B0);
+
+ ++iterator_A;
+ ++iterator_B0;
+
+ this->smem_iterator_A_.store(tb_frag_A);
+ this->smem_iterator_B0_.store(tb_frag_B0);
+
+ ++this->smem_iterator_A_;
+ ++this->smem_iterator_B0_;
+
+ __syncthreads();
+
+ // Pair of fragments used to overlap shared memory loads and math instructions
+ WarpFragmentA0 warp_frag_A0[2];
+ WarpFragmentB0 warp_frag_B0[2];
+
+ this->warp_tile_iterator_A0_.set_kgroup_index(0);
+ this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+ this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
+ this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
+
+ ++this->warp_tile_iterator_A0_;
+ ++this->warp_tile_iterator_B0_;
+
+ Operator0 warp_mma0;
+
+ int smem_write_stage_idx = 1;
+
+ // Avoid reading out of bounds
+ if (gemm_k_iterations_0 <= 1) {
+ iterator_A.clear_mask();
+ iterator_B0.clear_mask();
+ }
+
+ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
+ // shared memory loads (which have the tightest latency requirement).
+ iterator_A.load(tb_frag_A);
+
+ //
+ // Mainloop
+ //
+
+ // Note: The main loop does not support Base::WarpGemmIterations == 2.
+ CUTLASS_GEMM_LOOP
+ for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
+
+ //
+ // Loop over GEMM K dimension
+ //
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
+
+ // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+ // as the case may be.
+
+ if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
+
+ // Write fragments to shared memory
+ this->smem_iterator_A_.store(tb_frag_A);
+
+ this->smem_iterator_B0_.store(tb_frag_B0);
+
+ __syncthreads();
+
+ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
+ // shared memory loads (which have the tightest latency requirement).
+ iterator_A.load(tb_frag_A);
+
+ ++this->smem_iterator_B0_;
+ ++this->smem_iterator_A_;
+
+
+ // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+ if (smem_write_stage_idx == 1) {
+ this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+ this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+ }
+ else {
+ this->warp_tile_iterator_A0_.add_tile_offset(
+ {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
+ this->warp_tile_iterator_B0_.add_tile_offset(
+ {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
+ 0});
+ }
+
+ smem_write_stage_idx ^= 1;
+ }
+
+ this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+ this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+
+ this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
+ this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
+
+ ++this->warp_tile_iterator_A0_;
+ ++this->warp_tile_iterator_B0_;
+
+ if (warp_mma_k == 0) {
+
+ iterator_B0.load(tb_frag_B0);
+
+ ++iterator_A;
+ ++iterator_B0;
+
+ // Avoid reading out of bounds if this was the last loop iteration
+ if (gemm_k_iterations_0 <= 2) {
+ iterator_A.clear_mask();
+ iterator_B0.clear_mask();
+ }
+ }
+
+ warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], warp_frag_B0[warp_mma_k % 2], accum0);
+ }
+ }
+
+ //2nd Gemm
+
+ /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+ FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
+
+ //
+ // Prologue
+ //
+
+ FragmentB1 tb_frag_B1;
+
+ tb_frag_B1.clear();
+
+ // The last kblock is loaded in the prolog
+ iterator_B1.load(tb_frag_B1);
+
+ ++iterator_B1;
+
+ this->smem_iterator_B1_.store(tb_frag_B1);
+
+ ++this->smem_iterator_B1_;
+
+ __syncthreads();
+
+ // Pair of fragments used to overlap shared memory loads and math instructions
+ WarpFragmentA1 warp_frag_A1[2];
+ WarpFragmentB1 warp_frag_B1[2];
+
+ //warp_tile_iterator_A1_.set_kgroup_index(0);
+ this->warp_tile_iterator_B1_.set_kgroup_index(0);
+
+ warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0);
+ this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
+
+ ++warp_tile_iterator_A1_;
+ ++this->warp_tile_iterator_B1_;
+
+ Operator1 warp_mma1;
+
+ smem_write_stage_idx = 1;
+
+ int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
+
+ // Avoid reading out of bounds
+ if (gemm_k_iterations_1 <= 1) {
+ iterator_B1.clear_mask();
+ }
+
+ //
+ // Mainloop
+ //
+
+ // Note: The main loop does not support Base::WarpGemmIterations == 2.
+ CUTLASS_PRAGMA_UNROLL
+ for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
+
+ //
+ // Loop over GEMM K dimension
+ //
+
+ CUTLASS_PRAGMA_UNROLL
+ for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
+
+ // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+ // as the case may be.
+
+ if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
+
+ // Write fragments to shared memory
+
+ this->smem_iterator_B1_.store(tb_frag_B1);
+
+ __syncthreads();
+ ++smem_iterator_B1_;
+
+ // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+ if (smem_write_stage_idx == 1) {
+ smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+ }
+ else {
+ this->warp_tile_iterator_B1_.add_tile_offset(
+ {-Base::kStages * Policy1::kPartitionsK *
+ Base::kWarpGemmIterations1,
+ 0});
+ }
+
+ smem_write_stage_idx ^= 1;
+ }
+
+ this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+
+ warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
+ this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
+
+
+ ++warp_tile_iterator_A1_;
+ ++this->warp_tile_iterator_B1_;
+
+ if (warp_mma_k == 0) {
+
+ iterator_B1.load(tb_frag_B1);
+ ++iterator_B1;
+
+
+ // Avoid reading out of bounds if this was the last loop iteration
+ if (gemm_k_iterations_1 <= 2) {
+ iterator_B1.clear_mask();
+ }
+ }
+
+ warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], warp_frag_B1[warp_mma_k % 2], accum);
+ }
+ }
+
+ }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h b/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h
new file mode 100644
index 00000000..cd1403c7
--- /dev/null
+++ b/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h
@@ -0,0 +1,289 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_pipelined.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+ /// Element type for A matrix operand
+ typename ElementA_,
+ /// Layout type for A matrix operand
+ typename LayoutA_,
+ /// Access granularity of A matrix in units of elements
+ int kAlignmentA,
+ /// Element type for B matrix operand
+ typename ElementB_,
+ /// Layout type for B matrix operand
+ typename LayoutB_,
+ /// Access granularity of B matrix in units of elements
+ int kAlignmentB,
+ /// Element type for internal accumulation
+ typename ElementAccumulator_,
+ /// Layout type for C and D matrix operands
+ typename LayoutC_,
+ /// Operator class tag
+ typename OperatorClass_,
+ /// Tag indicating architecture to tune for
+ typename ArchTag_,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape0_,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape1_,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape0_,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape1_,
+ /// Instruction-level tile size (concept: GemmShape)
+ typename InstructionShape_,
+ /// Number of stages used in the pipelined mainloop
+ int Stages,
+ /// Operation performed by GEMM
+ typename Operator,
+ /// Epilogue output operator
+ typename EpilogueOutputOp,
+ /// Store the accumulators in row major or column major. Row major is used
+ /// when output layout is interleaved.
+ bool AccumulatorsInRowMajor = false>
+struct DefaultB2bMma;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output
+template <
+ /// Element type for A matrix operand
+ typename ElementA,
+ /// Layout type for A matrix operand
+ typename LayoutA,
+ /// Access granularity of A matrix in units of elements
+ int kAlignmentA,
+ /// Element type for B matrix operand
+ typename ElementB,
+ /// Layout type for B matrix operand
+ typename LayoutB,
+ /// Access granularity of B matrix in units of elements
+ int kAlignmentB,
+ /// Element type for internal accumulation
+ typename ElementAccumulator,
+ /// Tag indicating architecture to tune for
+ typename OperatorClass,
+ /// Tag indicating architecture to tune for
+ typename ArchTag,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape0,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape0,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape1,
+ /// Instruction-level tile size (concept: GemmShape)
+ typename InstructionShape,
+ /// Operation performed by GEMM
+ typename Operator,
+ /// Epilogue output operator
+ typename EpilogueOutputOp>
+struct DefaultB2bMma {
+ // Define the MmaCore components
+ using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+ ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+ ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+ OperatorClass, 2, Operator>;
+ using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+ ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+ ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+ OperatorClass, 2, Operator>;
+
+ // Define iterators over tiles from the A operand
+ using IteratorA0 =
+ cutlass::transform::threadblock::PredicatedTileIterator<
+ cutlass::MatrixShape,
+ ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>;
+
+ // Define iterators over tiles from the B operand
+ using IteratorB0 =
+ cutlass::transform::threadblock::PredicatedTileIterator<
+ cutlass::MatrixShape,
+ ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>;
+
+ // Use fragment iterator for A operand
+ using AccumulatorLayout = cutlass::layout::ColumnMajor;
+ using FragmentIteratorA1 =
+ cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+ cutlass::MatrixShape, //warp shape
+ cutlass::MatrixShape, //accumulator shape
+ MmaCore1::Shape::kK, //kBlocksColumn
+ ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>;
+
+ // Define iterators over tiles from the B operand
+ using IteratorB1 =
+ cutlass::transform::threadblock::PredicatedTileIterator<
+ cutlass::MatrixShape,
+ ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>;
+
+ // Define the threadblock-scoped pipelined matrix multiply
+ using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
+ typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+ IteratorB0, typename MmaCore0::SmemIteratorB,
+ typename MmaCore1::Shape, FragmentIteratorA1,
+ IteratorB1, typename MmaCore1::SmemIteratorB,
+ ElementAccumulator, layout::RowMajor,
+ EpilogueOutputOp,
+ typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
+
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for column-major-interleaved output
+template <
+ /// Element type for A matrix operand
+ typename ElementA,
+ /// Layout type for A matrix operand
+ typename LayoutA,
+ /// Access granularity of A matrix in units of elements
+ int kAlignmentA,
+ /// Element type for B matrix operand
+ typename ElementB,
+ /// Layout type for B matrix operand
+ typename LayoutB,
+ /// Access granularity of B matrix in units of elements
+ int kAlignmentB,
+ /// Element type for internal accumulation
+ typename ElementAccumulator,
+ /// Tag indicating architecture to tune for
+ typename OperatorClass,
+ /// Tag indicating architecture to tune for
+ typename ArchTag,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape0,
+ /// Threadblock-level tile size (concept: GemmShape)
+ typename ThreadblockShape1,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape0,
+ /// Warp-level tile size (concept: GemmShape)
+ typename WarpShape1,
+ /// Instruction-level tile size (concept: GemmShape)
+ typename InstructionShape,
+ /// Operation performed by GEMM
+ typename Operator,
+ /// Epilogue output operator
+ typename EpilogueOutputOp,
+ /// Number of Interleaved K
+ int InterleavedK>
+struct DefaultB2bMma, OperatorClass, ArchTag,
+ ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+ InstructionShape, 2, Operator, EpilogueOutputOp, true> {
+ // Define the MmaCore components
+ using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+ ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+ ElementB, LayoutB, ElementAccumulator,
+ layout::ColumnMajorInterleaved, OperatorClass, 2, Operator,
+ true>;
+ using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+ ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+ ElementB, LayoutB, ElementAccumulator,
+ layout::ColumnMajorInterleaved, OperatorClass, 2, Operator,
+ true>;
+
+ static_assert(kAlignmentA == 128 / sizeof_bits::value,
+ "Alignment must match thread data map's vector length");
+
+ static_assert(kAlignmentB ==128 / sizeof_bits::value,
+ "Alignment must match thread data map's vector length");
+
+ // Define iterators over tiles from the A operand
+ using IteratorA0 = cutlass::transform::threadblock::PredicatedTileIterator<
+ cutlass::MatrixShape, ElementA,
+ LayoutA, 1, typename MmaCore0::IteratorThreadMapA>;
+
+ // Define iterators over tiles from the B operand
+ using IteratorB0 = cutlass::transform::threadblock::PredicatedTileIterator<
+ cutlass::MatrixShape, ElementB,
+ LayoutB, 0, typename MmaCore0::IteratorThreadMapB>;
+
+ // Use fragment iterator for A operand
+ using AccumulatorLayout = cutlass::layout::RowMajor; //AccumulatorsInRowMajor = true
+ using FragmentIteratorA1 =
+ cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+ cutlass::MatrixShape, //warp shape
+ cutlass::MatrixShape, //accumulator shape
+ MmaCore1::Shape::kK, //kBlocksColumn
+ ElementAccumulator, ElementA, AccumulatorLayout,
+ InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>;
+
+ // Define iterators over tiles from the B operand
+ using IteratorB1 =
+ cutlass::transform::threadblock::PredicatedTileIterator<
+ cutlass::MatrixShape,
+ ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>;
+
+
+
+ // Define the threadblock-scoped pipelined matrix multiply
+ using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
+ typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+ IteratorB0, typename MmaCore0::SmemIteratorB,
+ typename MmaCore1::Shape, FragmentIteratorA1,
+ IteratorB1, typename MmaCore1::SmemIteratorB,
+ ElementAccumulator, layout::ColumnMajorInterleaved,
+ EpilogueOutputOp,
+ typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d5c503e9..3da7ae45 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@@ -60,6 +60,8 @@ foreach(EXAMPLE
08_turing_tensorop_gemm
10_planar_complex
11_planar_complex_array
+ 12_gemm_bias_relu
+ 13_fused_two_gemms
)
add_subdirectory(${EXAMPLE})
diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h
index 3232ef87..8b3bb071 100644
--- a/include/cutlass/aligned_buffer.h
+++ b/include/cutlass/aligned_buffer.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h
index b38a347a..faf01cc6 100644
--- a/include/cutlass/arch/arch.h
+++ b/include/cutlass/arch/arch.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -52,6 +52,10 @@ struct Sm72 {
struct Sm75 {
static int const kMinComputeCapability = 75;
};
+struct Sm80 {
+ static int const kMinComputeCapability = 80;
+};
+
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace arch
diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h
new file mode 100644
index 00000000..646b51de
--- /dev/null
+++ b/include/cutlass/arch/cache_operation.h
@@ -0,0 +1,60 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Directives related to cache operations
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Controls PTX cache operations
+struct CacheOperation {
+ enum Kind {
+ /// Cache at all levels - accessed again
+ Always,
+ /// Cache at global level
+ Global,
+ /// Streaming - likely to be accessed once
+ Streaming,
+ /// Indicates the line will not be used again
+ LastUse,
+ /// Don't cache, and fetch again
+ Volatile,
+ /// Write back at all coherent levels
+ WriteBack,
+ /// Write through to system memory
+ WriteThrough
+ };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h
index fc939053..48ef02cd 100644
--- a/include/cutlass/arch/memory.h
+++ b/include/cutlass/arch/memory.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -28,13 +28,271 @@
#pragma once
+#include "cutlass/cutlass.h"
+
namespace cutlass {
namespace arch {
/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+ /// Fragment type to store loaded data
+ typename AccessType,
+ /// The bytes of loading
+ int LoadBytes
+ >
+struct global_load;
/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Specializations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename AccessType>
+struct global_load<AccessType, 32> {
+ CUTLASS_DEVICE
+ global_load(AccessType &D, void const *ptr, bool pred_guard) {
+ uint4 *data = reinterpret_cast<uint4 *>(&D);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %9, 0;\n"
+ " mov.b32 %0, %10;\n"
+ " mov.b32 %1, %11;\n"
+ " mov.b32 %2, %12;\n"
+ " mov.b32 %3, %13;\n"
+ " mov.b32 %4, %14;\n"
+ " mov.b32 %5, %15;\n"
+ " mov.b32 %6, %16;\n"
+ " mov.b32 %7, %17;\n"
+ " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n"
+ " @p ld.global.v4.u32 {%4, %5, %6, %7}, [%18];\n"
+ "}\n"
+ : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
+ "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
+ : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
+ "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
+ "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
+ }
+};
+
+
+template <typename AccessType>
+struct global_load<AccessType, 16> {
+ CUTLASS_DEVICE
+ global_load(AccessType &D, void const *ptr, bool pred_guard) {
+ uint4 &data = reinterpret_cast<uint4 &>(D);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %5, 0;\n"
+ " mov.b32 %0, %6;\n"
+ " mov.b32 %1, %7;\n"
+ " mov.b32 %2, %8;\n"
+ " mov.b32 %3, %9;\n"
+ " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n"
+ "}\n"
+ : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
+ : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
+ }
+};
+
+template <typename AccessType>
+struct global_load<AccessType, 8> {
+ CUTLASS_DEVICE
+ global_load(AccessType &D, void const *ptr, bool pred_guard) {
+ uint2 &data = reinterpret_cast<uint2 &>(D);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %3, 0;\n"
+ " mov.b32 %0, %4;\n"
+ " mov.b32 %1, %5;\n"
+ " @p ld.global.v2.u32 {%0, %1}, [%2];\n"
+ "}\n"
+ : "=r"(data.x), "=r"(data.y)
+ : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y));
+ }
+};
+
+template <typename AccessType>
+struct global_load<AccessType, 4> {
+ CUTLASS_DEVICE
+ global_load(AccessType &D, void const *ptr, bool pred_guard) {
+ unsigned &data = reinterpret_cast<unsigned &>(D);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %2, 0;\n"
+ " mov.b32 %0, %3;\n"
+ " @p ld.global.u32 %0, [%1];\n"
+ "}\n"
+ : "=r"(data)
+ : "l"(ptr), "r"((int)pred_guard), "r"(data));
+ }
+};
+
+template <typename AccessType>
+struct global_load<AccessType, 2> {
+ CUTLASS_DEVICE
+ global_load(AccessType &D, void const *ptr, bool pred_guard) {
+ uint16_t &data = reinterpret_cast<uint16_t &>(D);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %2, 0;\n"
+ " mov.b16 %0, %3;\n"
+ " @p ld.global.u16 %0, [%1];\n"
+ "}\n"
+ : "=h"(data)
+ : "l"(ptr), "r"((int)pred_guard), "h"(data));
+ }
+};
+
+template <typename AccessType>
+struct global_load<AccessType, 1> {
+ CUTLASS_DEVICE
+ global_load(AccessType &D, void const *ptr, bool pred_guard) {
+ if (pred_guard) D = *(reinterpret_cast<AccessType const *>(ptr));
+ }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+ /// Fragment type to store loaded data
+ typename AccessType,
+ /// The bytes of loading
+ int LoadBytes
+ >
+struct global_store;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Specializations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename AccessType>
+struct global_store<AccessType, 32> {
+ CUTLASS_DEVICE
+ global_store(AccessType const &D, void *ptr, bool pred_guard) {
+ uint4 const *data = reinterpret_cast<uint4 const *>(&D);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %5, 0;\n"
+ " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
+ " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n"
+ "}\n"
+ :
+ : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z),
+ "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16),
+ "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w));
+ }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 16> {
+ CUTLASS_DEVICE
+ global_store(AccessType const &D, void *ptr, bool pred_guard) {
+ uint4 const &data = reinterpret_cast<uint4 const &>(D);
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %5, 0;\n"
+ " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
+ "}\n"
+ :
+ : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), "r"((int)pred_guard));
+ }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 8> {
+ CUTLASS_DEVICE
+ global_store(AccessType const &D, void *ptr, bool pred_guard) {
+ uint2 const &data = reinterpret_cast<uint2 const &>(D);
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %3, 0;\n"
+ " @p st.global.v2.u32 [%0], {%1, %2};\n"
+ "}\n"
+ :
+ : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard));
+ }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 4> {
+ CUTLASS_DEVICE
+ global_store(AccessType const &D, void *ptr, bool pred_guard) {
+ uint32_t const &data = reinterpret_cast<uint32_t const &>(D);
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %2, 0;\n"
+ " @p st.global.u32 [%0], %1;\n"
+ "}\n"
+ :
+ : "l"(ptr), "r"(data), "r"((int)pred_guard));
+ }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 2> {
+ CUTLASS_DEVICE
+ global_store(AccessType const &D, void *ptr, bool pred_guard) {
+ uint16_t const &data = reinterpret_cast<uint16_t const &>(D);
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %2, 0;\n"
+ " @p st.global.u16 [%0], %1;\n"
+ "}\n"
+ :
+ : "l"(ptr), "h"(data), "r"((int)pred_guard));
+ }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 1> {
+ CUTLASS_DEVICE
+ global_store(AccessType const &D, void *ptr, bool pred_guard) {
+ if (pred_guard) *(reinterpret_cast<AccessType *>(ptr)) = D;
+ }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
} // namespace arch
} // namespace cutlass
@@ -42,4 +300,6 @@ namespace arch {
/////////////////////////////////////////////////////////////////////////////////////////////////
#include "memory_sm75.h"
+#include "memory_sm80.h"
+
/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h
index 195f8abf..3fd121b9 100644
--- a/include/cutlass/arch/memory_sm75.h
+++ b/include/cutlass/arch/memory_sm75.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -50,20 +50,20 @@ inline __device__ void ldsm(Array & D, void const* ptr);
//
/////////////////////////////////////////////////////////////////////////////////////////////////
-#if ! defined(CUDA_LDMATRIX_SUPPORTED)
- #define CUDA_LDMATRIX_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2))
+#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) || (__CUDACC_VER_MAJOR__ >= 11)
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
+#define CUDA_LDMATRIX_ACTIVATED 1
#endif
-#if ! defined(CUDA_LDMATRIX_ENABLED)
- #define CUDA_LDMATRIX_ENABLED CUDA_LDMATRIX_SUPPORTED
-#endif
-
-#if CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
- #define CUDA_LDMATRIX_ACTIVATED 1
+#define CUDA_LDMATRIX_SUPPORTED 1
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
-
+/*
+#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) && (__CUDACC_VER_MAJOR__ > 10)
+ #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED 1
+#endif
#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED)
#define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 1))
#endif
@@ -71,8 +71,9 @@ inline __device__ void ldsm(Array & D, void const* ptr);
#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_ENABLED)
#define CUDA_NVVM_GET_SMEM_POINTER_ENABLED CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED
#endif
+*/
-#if CUDA_NVVM_GET_SMEM_POINTER_ENABLED
+#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
extern "C" {
//
// This NVVM intrinsic is subject to change in future versions of CUDA.
@@ -85,19 +86,49 @@ inline __device__ void ldsm(Array & D, void const* ptr);
/////////////////////////////////////////////////////////////////////////////////////////////////
-#if CUDA_NVVM_GET_SMEM_POINTER_ENABLED
+/// CUTLASS helper to get SMEM pointer
+inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
+
+// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to
+// the previous internal intrinsics if they are available.
+#if (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11)
+ //
+ // This NVVM intrinsic converts an address in shared memory to a plain
+ // unsigned integer. This is necessary to pass to shared memory instructions
+ // in inline PTX.
+ //
+ // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer() [only available in 10.2].
+ //
+ //__device__ size_t __cvta_generic_to_shared(void* ptr);
/// CUTLASS helper to get SMEM pointer
- inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) {
- return __nvvm_get_smem_pointer(const_cast<void *>(ptr));
- }
+ return static_cast(__cvta_generic_to_shared(ptr));
- /// CUTLASS helper to get SMEM pointer
- inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
- return __nvvm_get_smem_pointer(ptr);
- }
+#elif (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
+ return __nvvm_get_smem_pointer(ptr);
+
+#elif defined(__CUDA_ARCH__)
+
+ uint32_t smem_ptr;
+
+ asm(
+ "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
+ : "=r"(smem_ptr) : "l"(ptr));
+
+ return smem_ptr;
+
+#else
+
+ return 0;
#endif
+}
+
+/// CUTLASS helper to get SMEM pointer
+inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) {
+ return cutlass_get_smem_pointer(const_cast<void *>(ptr));
+}
+
/////////////////////////////////////////////////////////////////////////////////////////////////
template <>
@@ -235,5 +266,6 @@ inline __device__ void ldsm(
}
/////////////////////////////////////////////////////////////////////////////////////////////////
+
} // namespace arch
} // namespace cutlass
diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h
new file mode 100644
index 00000000..04c56876
--- /dev/null
+++ b/include/cutlass/arch/memory_sm80.h
@@ -0,0 +1,238 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief Architecture-specific operators on memory added for SM80
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/cache_operation.h"
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+ #define CUDA_CP_ASYNC_ACTIVATED 1
+#else
+ #define CUDA_CP_ASYNC_ACTIVATED 0
+#endif
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Initiates an asynchronous copy from global memory to shared memory.
+///
+/// LDGSTS
+///
+template <
+ /// Size of the access in bytes
+ int SizeInBytes,
+ /// Cache operation
+ CacheOperation::Kind cache_op = CacheOperation::Always>
+struct cp_async;
+
+/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
+/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
+///
+/// LDGSTS
+///
+template <
+ /// Size of the access in bytes
+ int SizeInBytes,
+ /// Cache operation
+ CacheOperation::Kind cache_op = CacheOperation::Always>
+struct cp_async_zfill;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization
+template <
+ /// Size of the access in bytes
+ int SizeInBytes>
+struct cp_async<SizeInBytes, CacheOperation::Always> {
+ /// Copy
+ CUTLASS_DEVICE
+ cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
+ #if CUDA_CP_ASYNC_ACTIVATED
+
+ unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %0, 0;\n"
+ " @p cp.async.ca.shared.global [%1], [%2], %3;\n"
+ "}\n" ::"r"((int)pred_guard),
+ "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes));
+
+ #else
+ using AccessType = Array<uint8_t, SizeInBytes>;
+
+ if (pred_guard) {
+ *static_cast(smem_ptr) = *static_cast(global_ptr);
+ }
+ #endif
+ }
+};
+
+/// Partial specialization
+template <
+ /// Size of the access in bytes
+ int SizeInBytes>
+struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
+ /// Copy with zero fill
+ CUTLASS_DEVICE
+ cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
+ #if CUDA_CP_ASYNC_ACTIVATED
+
+ unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+ int src_in_bytes = (pred_guard ? SizeInBytes : 0);
+
+ asm volatile(
+ "cp.async.ca.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
+ "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes));
+
+ #else
+ using AccessType = Array<uint8_t, SizeInBytes>;
+
+ if (pred_guard) {
+ *static_cast(smem_ptr) = *static_cast(global_ptr);
+ }
+ else {
+ AccessType zeros;
+ zeros.clear();
+ *static_cast(smem_ptr) = zeros;
+ }
+ #endif
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization
+template <
+ /// Size of the access in bytes
+ int SizeInBytes>
+struct cp_async<SizeInBytes, CacheOperation::Global> {
+ /// Copy
+ CUTLASS_DEVICE
+ cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
+ #if CUDA_CP_ASYNC_ACTIVATED
+
+ static_assert(SizeInBytes == 16,
+ "cp.async only supports CacheOperation::Global when access size is 16B.");
+
+ unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+
+ asm volatile(
+ "{\n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %0, 0;\n"
+ " @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+ "}\n" ::"r"((int)pred_guard),
+ "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes));
+
+ #else
+ using AccessType = Array<uint8_t, SizeInBytes>;
+
+ if (pred_guard) {
+ *static_cast(smem_ptr) = *static_cast(global_ptr);
+ }
+ #endif
+ }
+};
+
+/// Partial specialization
+template <
+ /// Size of the access in bytes
+ int SizeInBytes>
+struct cp_async_zfill<SizeInBytes, CacheOperation::Global> {
+ /// Copy with zero fill
+ CUTLASS_DEVICE
+ cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
+ #if CUDA_CP_ASYNC_ACTIVATED
+
+ static_assert(SizeInBytes == 16,
+ "cp.async only supports CacheOperation::Global when access size is 16B.");
+
+ unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+ int src_in_bytes = (pred_guard ? SizeInBytes : 0);
+
+ asm volatile(
+ "cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
+ "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes));
+
+ #else
+ using AccessType = Array<uint8_t, SizeInBytes>;
+
+ if (pred_guard) {
+ *static_cast(smem_ptr) = *static_cast(global_ptr);
+ }
+ else {
+ AccessType zeros;
+ zeros.clear();
+ *static_cast(smem_ptr) = zeros;
+ }
+ #endif
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
+CUTLASS_DEVICE
+void cp_async_fence() {
+ #if CUDA_CP_ASYNC_ACTIVATED
+ asm volatile("cp.async.commit_group;\n" ::);
+ #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Blocks until all but previous cp.async.commit_group operations have committed.
+template <int N>
+CUTLASS_DEVICE void cp_async_wait() {
+ #if CUDA_CP_ASYNC_ACTIVATED
+ asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+ #endif
+}
+
+/// Blocks until all previous cp.async.commit_group operations have committed.
+template <>
+CUTLASS_DEVICE void cp_async_wait<0>() {
+ #if CUDA_CP_ASYNC_ACTIVATED
+ asm volatile("cp.async.wait_all;\n" ::);
+ #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h
index e59b710f..74c24695 100644
--- a/include/cutlass/arch/mma.h
+++ b/include/cutlass/arch/mma.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@@ -51,11 +51,26 @@ struct OpMultiplyAddSaturate;
/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tag indicating the input is converted to a narrower type (BF16)
+struct OpMultiplyAddFastBF16;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the input is converted to a narrower type (F16)
+struct OpMultiplyAddFastF16;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
/// Tag indicating the complex multiply-add operation
struct OpMultiplyAddComplex;
/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tag indicating the gaussian complex multiply-add operation
+struct OpMultiplyAddGaussianComplex;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
/// Tag indicating the inner product is defined by (XOR, POPC)
struct OpXorPopc;
diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h
index 8698a8b3..fce521dc 100644
--- a/include/cutlass/arch/mma_sm50.h
+++ b/include/cutlass/arch/mma_sm50.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h
index 6e513ced..ab0481ae 100644
--- a/include/cutlass/arch/mma_sm60.h
+++ b/include/cutlass/arch/mma_sm60.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h
index 68a1b145..9ec8857e 100644
--- a/include/cutlass/arch/mma_sm61.h
+++ b/include/cutlass/arch/mma_sm61.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h
index 57b50e00..b03ce2c1 100644
--- a/include/cutlass/arch/mma_sm70.h
+++ b/include/cutlass/arch/mma_sm70.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h
index fb8a3dc5..ef65f20b 100644
--- a/include/cutlass/arch/mma_sm75.h
+++ b/include/cutlass/arch/mma_sm75.h
@@ -1,5 +1,5 @@
/***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h
new file mode 100644
index 00000000..445ec388
--- /dev/null
+++ b/include/cutlass/arch/mma_sm80.h
@@ -0,0 +1,2091 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Matrix multiply
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
+
+#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#define CUTLASS_ARCH_MMA_SM80_ENABLED
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1688 - Float BF16, FP32 accumulation
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation - F32 = bf16 * bf16 + F32
+template <>
+struct Mma<
+ gemm::GemmShape<16, 8, 8>,
+ 32,
+ bfloat16_t,
+ layout::RowMajor,
+ bfloat16_t,
+ layout::ColumnMajor,
+ float,
+ layout::RowMajor,
+ OpMultiplyAdd> {
+
+ using Shape = gemm::GemmShape<16, 8, 8>;
+
+ using ElementA = bfloat16_t;
+ using LayoutA = layout::RowMajor;
+ using FragmentA = Array<bfloat16_t, 4>;
+
+ using ElementB = bfloat16_t;
+ using LayoutB = layout::ColumnMajor;
+ using FragmentB = Array<bfloat16_t, 2>;
+
+ using ElementC = float;
+ using LayoutC = layout::RowMajor;
+ using FragmentC = Array<float, 4>;
+
+ using Operator = OpMultiplyAdd;
+ using ArchTag = arch::Sm80;
+
+ CUTLASS_HOST_DEVICE
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+ FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+ uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
+ uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+
+ asm(
+ "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
+ "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+ :
+ "r"(A[0]), "r"(A[1]),
+ "r"(B[0]),
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
+ );
+
+#else
+ assert(0);
+#endif
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1684 - Float TF32
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
+template <>
+struct Mma<
+ gemm::GemmShape<16, 8, 4>,
+ 32,
+ tfloat32_t,
+ layout::RowMajor,
+ tfloat32_t,
+ layout::ColumnMajor,
+ float,
+ layout::RowMajor,
+ OpMultiplyAdd> {
+
+ using Shape = gemm::GemmShape<16, 8, 4>;
+
+ using ElementA = tfloat32_t;
+ using LayoutA = layout::RowMajor;
+ using FragmentA = Array<tfloat32_t, 2>;
+
+ using ElementB = tfloat32_t;
+ using LayoutB = layout::ColumnMajor;
+ using FragmentB = Array<tfloat32_t, 1>;
+
+ using ElementC = float;
+ using LayoutC = layout::RowMajor;
+ using FragmentC = Array<float, 4>;
+
+ using Operator = OpMultiplyAdd;
+ using ArchTag = arch::Sm80;
+
+ CUTLASS_HOST_DEVICE
+ void operator()(
+ FragmentC &d,
+ FragmentA const &a,
+ FragmentB const &b,
+ FragmentC const &c
+ ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+ float const *C = reinterpret_cast<float const *>(&c);
+ float *D = reinterpret_cast<float *>(&d);
+
+ asm volatile(
+ "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+ : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+ :
+ "r"(A[0]), "r"(A[1]),
+ "r"(B[0]),
+ "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
+ );
+
+#else
+ assert(0);
+#endif
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1688 - Float TF32
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 8>, 32, tfloat32_t, layout::RowMajor,
+           tfloat32_t, layout::ColumnMajor, float, layout::RowMajor,
+           OpMultiplyAdd> {
+  using Shape = gemm::GemmShape<16, 8, 8>;
+
+  using ElementA = tfloat32_t;
+  using LayoutA = layout::RowMajor;
+  // Per-thread A fragment: four TF32 values (the {%4..%7} operands below)
+  using FragmentA = Array<tfloat32_t, 4>;
+
+  using ElementB = tfloat32_t;
+  using LayoutB = layout::ColumnMajor;
+  // Per-thread B fragment: two TF32 values (the {%8,%9} operands below)
+  using FragmentB = Array<tfloat32_t, 2>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  // Per-thread accumulator fragment: four F32 values
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add via a single warp-synchronous mma.sync.m16n8k8
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    // TF32 operands are presented to the instruction as 32-bit registers ("r")
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16816
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F16 = F16 * F16 + F16
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 16>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 16>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  // Per-thread A fragment: eight halfs packed into four 32-bit registers
+  using FragmentA = Array<half_t, 8>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  // Per-thread B fragment: four halfs packed into two 32-bit registers
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  // Per-thread accumulator fragment: four halfs packed into two 32-bit registers
+  using FragmentC = Array<half_t, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add via a single warp-synchronous mma.sync.m16n8k16
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    // Packed half2 operands are presented as 32-bit registers ("r")
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
+    uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
+        : "=r"(D[0]), "=r"(D[1])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
+          "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1])
+    );
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 16>,
+  32,
+  bfloat16_t,
+  layout::RowMajor,
+  bfloat16_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 16>;
+
+  using ElementA = bfloat16_t;
+  using LayoutA = layout::RowMajor;
+  // Per-thread A fragment: eight bf16 values packed into four 32-bit registers
+  using FragmentA = Array<bfloat16_t, 8>;
+
+  using ElementB = bfloat16_t;
+  using LayoutB = layout::ColumnMajor;
+  // Per-thread B fragment: four bf16 values packed into two 32-bit registers
+  using FragmentB = Array<bfloat16_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  // Per-thread accumulator fragment: four F32 values
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add via a single warp-synchronous mma.sync.m16n8k16
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    // All operands (including the F32 accumulators) are bound as 32-bit
+    // registers ("r"); the bit patterns are what the instruction consumes.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
+    uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = F16 * F16 + F32
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 16>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 16>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  // Per-thread A fragment: eight halfs packed into four 32-bit registers
+  using FragmentA = Array<half_t, 8>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  // Per-thread B fragment: four halfs packed into two 32-bit registers
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  // Per-thread accumulator fragment: four F32 values
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add via a single warp-synchronous mma.sync.m16n8k16
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    // All operands (including the F32 accumulators) are bound as 32-bit
+    // registers ("r"); the bit patterns are what the instruction consumes.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
+    uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, "
+        "{%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 884 - F64
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F64 = F64 * F64 + F64
+template <>
+struct Mma<
+  gemm::GemmShape<8,8,4>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8,8,4>;
+
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  // Per-thread A fragment: a single F64 value
+  using FragmentA = Array<double, 1>;
+
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  // Per-thread B fragment: a single F64 value
+  using FragmentB = Array<double, 1>;
+
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  // Per-thread accumulator fragment: two F64 values
+  using FragmentC = Array<double, 2>;
+
+  using Operator = OpMultiplyAdd;
+
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add via a single warp-synchronous mma.sync.m8n8k4
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    // F64 operands are bound as 64-bit registers ("l"); the bit patterns
+    // are what the instruction consumes.
+    uint64_t const & A = reinterpret_cast<uint64_t const &>(a);
+    uint64_t const & B = reinterpret_cast<uint64_t const &>(b);
+
+    uint64_t const *C = reinterpret_cast<uint64_t const *>(&c);
+    uint64_t *D = reinterpret_cast<uint64_t *>(&d);
+
+    asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=l"(D[0]), "=l"(D[1])
+      : "l"(A), "l"(B), "l"(C[0]), "l"(C[1]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16816 - S8 input, S32 accumulation
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S8 * S8 + S32
+template <>
+struct Mma<
+ gemm::GemmShape<16,8,16>,
+ 32,
+ int8_t,
+ layout::RowMajor,
+ int8_t,
+ layout::ColumnMajor,
+ int,
+ layout::RowMajor,
+ OpMultiplyAdd> {
+
+ using Shape = gemm::GemmShape<16,8,16>;
+
+ using ElementA = int8_t;
+ using LayoutA = layout::RowMajor;
+ using FragmentA = Array;
+
+ using ElementB = int8_t;
+ using LayoutB = layout::ColumnMajor;
+ using FragmentB = Array;
+
+ using ElementC = int;
+ using LayoutC = layout::RowMajor;
+ using FragmentC = Array;
+
+ using Operator = OpMultiplyAdd;
+
+ using ArchTag = arch::Sm80;
+
+ /// Computes multiply-add
+ CUTLASS_HOST_DEVICE
+ void operator()(
+ FragmentC &d,
+ FragmentA const &a,
+ FragmentB const &b,
+ FragmentC const &c
+ ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+ uint32_t const *A = reinterpret_cast(&a);
+ uint32_t const &B = reinterpret_cast(b);
+
+ int const *C = reinterpret_cast