Create PUBLICATIONS.md (#189 )

Update generator.py (#192 )
Merge pull request #187 from NVIDIA/cutlass_2.5
2021-03-03 11:17:40 -08:00 · 2021-03-02 12:21:48 -08:00 · 2021-02-26 23:56:04 -06:00 · 2021-02-26 15:46:57 -05:00 · 2021-02-26 15:32:19 -05:00 · 2021-02-26 13:55:04 -05:00
798 changed files with 62375 additions and 2311 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,33 @@

 # CUTLASS 2.x

+## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26)
+  * Tensor reductions
+    * _m_-to-_n_ reductions of tensors with affine layout
+    * [Specializations](/test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension
+    * [Specializations](/test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension
+    * Custom reduction functors such as `cutlass::logical_and`
+    * Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31)
+  * Optimizations for 3-D convolution
+    * [Optimized tile iterators](include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution
+    * Full coverage of [forward](test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) and [backwards](test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) passes for 3D convolution
+  * [Fused Convolution+Convolution example](/examples/13_two_tensor_op_fusion/README.md)
+  * Corrections and bug fixes reported by the CUTLASS community
+    * Thank you for filing these issues!
+  
+
+## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19)
+  * Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs
+    * Operators: forward (Fprop), backward data gradient (Dgrad), and backward weight gradient (Wgrad) convolution
+    * Data type: FP32, complex<FP32>, Tensor Float 32 (TF32), BFloat16 (BF16), Float16, Int4, Int8, Int32
+    * Spatial dimensions: 1-D, 2-D, and 3-D
+    * Layout: NHWC, NCxHWx
+  * Implicit GEMM convolution components: 
+    * Global memory iterators supporting Fprop, Dgrad, and Wgrad
+    * `MmaMultistage` for implicit GEMM convolution for NVIDIA Ampere architecture
+    * `MmaPipeline` for implicit GEMM convolution for NVIDIA Volta and Turing architectures
+    * [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation
+
 ## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23)
 * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
   * [Sparse Tensor Core GEMM kernels](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu):
@ -115,7 +142,7 @@

 ## Copyright

-Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.

 ```
  Redistribution and use in source and binary forms, with or without modification, are permitted
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
@ -32,7 +32,7 @@ endif()

 message(STATUS "CMake Version: ${CMAKE_VERSION}")

-project(CUTLASS VERSION 2.3.0 LANGUAGES CXX)
+project(CUTLASS VERSION 2.5.0 LANGUAGES CXX)
 include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)

 find_package(Doxygen QUIET)
@ -67,6 +67,8 @@ else()
  set(CUTLASS_ENABLE_TOOLS_INIT ON)
 endif()

+set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
+
 set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable CUTLASS Examples")
 set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
 set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Library")
@ -114,10 +116,6 @@ if (POLICY CMP0076)
  cmake_policy(SET CMP0076 NEW)
 endif()

-if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 )
-    message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!")
-endif()
-
 include(GNUInstallDirs)

 link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
@ -137,7 +135,12 @@ if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
 endif()

 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-set(CUTLASS_LIBRARY_DEBUG_POSTFIX ".debug" CACHE STRING "Default postfix value for debug libraries")
+if (DEFINED CMAKE_DEBUG_POSTFIX)
+  set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT ${CMAKE_DEBUG_POSTFIX})
+else()
+  set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT .debug)
+endif()
+set(CUTLASS_LIBRARY_DEBUG_POSTFIX ${CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT} CACHE STRING "Default postfix value for debug libraries")

 if(WIN32)
  # On Windows we link against the shared (DLL) runtime. Change gtest settings to match this.
@ -192,7 +195,6 @@ endif()
 set(CUTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.")
 list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_DEBUG_TRACE_LEVEL=${CUTLASS_DEBUG_TRACE_LEVEL})

-
 set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
  "Enable PTX mma instruction for collective matrix multiply operations.")

@ -253,6 +255,17 @@ if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
  list(APPEND CUTLASS_CUDA_NVCC_FLAGS -lineinfo)
 endif()

+#Report CUDA build flags
+if (CUDA_COMPILER MATCHES "[Cc]lang")
+  if(CUTLASS_CUDA_CLANG_FLAGS)
+    message(STATUS "Using CLANG flags: ${CUTLASS_CUDA_CLANG_FLAGS}")
+  endif()
+else()
+  if(CUTLASS_CUDA_NVCC_FLAGS)
+    message(STATUS "Using NVCC flags: ${CUTLASS_CUDA_NVCC_FLAGS}")
+  endif()
+endif()
+
 if(CUDA_COMPILER MATCHES "[Cc]lang")
  if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" )
    message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" )
@ -314,20 +327,35 @@ function(cutlass_apply_cuda_gencode_flags TARGET)

 endfunction()

+# Cache the flags so they are available when the function below is called anywhere globally. 
+
+set(__CUTLASS_CUDA_FLAGS ${CUTLASS_CUDA_FLAGS} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_CLANG_FLAGS ${CUTLASS_CUDA_CLANG_FLAGS} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_CLANG_FLAGS_RELEASE ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_CLANG_FLAGS_DEBUG ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
+set(__CUTLASS_CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG} CACHE INTERNAL "")
+
 function(cutlass_apply_standard_compile_options TARGET)

  if(CUDA_COMPILER MATCHES "[Cc]lang")
    set(CUDA_COMPILE_LANGUAGE CXX)
-    set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_CLANG_FLAGS})
-    set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE})
-    set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO})
-    set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG})
+    set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_CLANG_FLAGS})
+    set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_CLANG_FLAGS_RELEASE})
+    set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO})
+    set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_CLANG_FLAGS_DEBUG})
  else()
    set(CUDA_COMPILE_LANGUAGE CUDA)
-    set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_NVCC_FLAGS})
-    set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE})
-    set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO})
-    set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG})
+    set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_NVCC_FLAGS})
+    set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_NVCC_FLAGS_RELEASE})
+    set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO})
+    set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_NVCC_FLAGS_DEBUG})
  endif()

  target_compile_options(
@ -460,27 +488,203 @@ endif()

 ################################################################################

+include(CTest)
+enable_testing()
+if (NOT TARGET test_all)
+  add_custom_target(test_all)
+endif()
+
+set(CUTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables")
+set(CUTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE BOOL "Environment in which to invoke unit test executables")
+
+set(CMAKE_TEST_INSTALL_PREFIX test CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
+set(CUTLASS_TEST_INSTALL_PREFIX ${CMAKE_TEST_INSTALL_PREFIX}/cutlass CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
+set(CUTLASS_TEST_INSTALL_BINDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
+set(CUTLASS_TEST_INSTALL_LIBDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
+
+install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX})
+install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR})
+install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_LIBDIR})
+install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest)
+
+################################################################################
+
 include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake)

 if (CUTLASS_ENABLE_CUBLAS)
  target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1)
 endif()

+include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake)
+
+if (CUTLASS_ENABLE_CUDNN)
+  target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1)
+endif()
+
 ################################################################################

-if(CUTLASS_ENABLE_TOOLS)
+set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake)
+set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "")
+
+function(cutlass_add_executable_tests NAME TARGET)
+# 
+# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the 
+# <CMAKE_BINARY_DIR> or the <CMAKE_INSTALL_PREFIX>/<CUTLASS_TEST_INSTALL_PREFIX> after installation.
+# 
+# NAME: The base name for the test. Can be run with `make <NAME>` or `ctest -R 'c<NAME>'`.
+# TARGET: The target corresponding to the executable under test.
+# DISABLE_EXECUTABLE_INSTALL_RULE: An option, if given, that disables creating an install rule for TARGET.
+# DEPENDS: A list of targets or files on which this test is dependent.
+# DEPENDEES: A list of targets which should depend on this test.
+# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments
+#   to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of 
+#   options given. If this option is not used, a single test with no arguments is generated.
+# 
+
+  set(options DISABLE_EXECUTABLE_INSTALL_RULE)
+  set(oneValueArgs)
+  set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
+  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    
+  if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND CUTLASS_INSTALL_TESTS)
+  
+    # file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+  
+    install(
+      TARGETS ${TARGET}
+      RUNTIME DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR}
+      )
+  
+  endif()
+
+  if (NOT __TEST_COMMAND_OPTIONS)
+    set(__TEST_COMMAND_OPTIONS " ")
+  endif()
+
+  list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT)
+  set(CMD_IDX 0)
+
+  if (CMD_COUNT GREATER 1)
+    add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS})
+    foreach(DEPENDEE ${__DEPENDEES})
+      add_dependencies(${DEPENDEE} ${NAME})
+    endforeach()
+  endif()
+
+  foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS})
+
+    if (CMD_COUNT GREATER 1)
+      set(TEST_NAME ${NAME}_${CMD_IDX})
+    else()
+      set(TEST_NAME ${NAME})
+    endif()
+
+    # The following rigmarole is needed to deal with spaces and possible quotes in 
+    # command line arguments. The options are passed "by reference" as the actual
+    # variable names holding the real options. We then expand these in a way that
+    # preserves any quotes. Note, they have to be in this order for it to work for 
+    # all the use cases below.
+
+    set(CMD_OPTIONS ${${CMD_OPTIONS}})
+    list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS)
+    separate_arguments(CMD_OPTIONS)
+   
+    add_custom_target(
+      ${TEST_NAME}
+      COMMAND
+      ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
+      DEPENDS
+      ${TARGET}
+      )
+
+    if (CMD_COUNT GREATER 1)
+      add_dependencies(${NAME} ${TEST_NAME})
+    endif()
+
+    foreach(DEPENDEE ${__DEPENDEES})
+      add_dependencies(${DEPENDEE} ${TEST_NAME})
+    endforeach()
+
+    add_test(
+      NAME c${TEST_NAME}
+      COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
+      )
+
+    if (CUTLASS_INSTALL_TESTS)
+
+      # To run the tests from an install package with tests enabled, we need to generate test files
+      # that don't rely on the current directory structure in build.  
+
+      set(TEST_NAME c${TEST_NAME})
+      set(TEST_EXE $<TARGET_FILE_NAME:${TARGET}>)
+      set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR})
+      configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)
+
+      file(GENERATE 
+        OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" 
+        INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
+        )
+  
+      install(
+        FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
+        DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/
+        )
+  
+      set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")
+  
+    endif()
+
+    math(EXPR CMD_IDX "${CMD_IDX} + 1")
+
+  endforeach()
+
+endfunction()
+
+if (CUTLASS_ENABLE_TOOLS)
  add_subdirectory(tools)
+  if (CUTLASS_ENABLE_PROFILER)
+    add_dependencies(test_all test_profiler)
+  endif()  
 endif()
-if(CUTLASS_ENABLE_EXAMPLES)
+if (CUTLASS_ENABLE_EXAMPLES)
  add_subdirectory(examples)
+  add_dependencies(test_all test_examples)
 endif()

-if(CUTLASS_ENABLE_TESTS)
-  include(CTest)
-  enable_testing()
+if (CUTLASS_ENABLE_TESTS)
  add_subdirectory(test)
+  add_dependencies(test_all test_unit)
 endif()

+if (CUTLASS_INSTALL_TESTS)
+
+  file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/cmake")
+
+  file(WRITE "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "# Generated File\n")
+  foreach(GENERATED_FILE ${CUTLASS_CTEST_GENERATED_FILES})
+    file(APPEND "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "include(${GENERATED_FILE})\n")
+  endforeach()
+
+  install(
+    FILES "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake"
+    DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/
+    )
+
+endif()
+
+#? install(
+#?   FILES ${CMAKE_BINARY_DIR}/CTestTestfile.cmake
+#?   DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/
+#?   )
+#? 
+#? install(
+#?   DIRECTORY 
+#?     ${CMAKE_BINARY_DIR}/tools
+#?     ${CMAKE_BINARY_DIR}/test
+#?   DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/
+#?   FILES_MATCHING PATTERN "CTestTestfile.cmake"
+#?   )
+
 ################################################################################

 install(
--- a/CUDA.cmake
+++ b/CUDA.cmake
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
@ -204,7 +204,7 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
 # paths by default, so we add it explicitly here.

 function(cutlass_correct_source_file_language_property)
-  if(CUDA_COMPILER MATCHES "clang")
+  if(CUDA_COMPILER MATCHES "[Cc]lang")
    foreach(File ${ARGN})
      if(File MATCHES ".*\.cu$")
        set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
--- a/PUBLICATIONS.md
+++ b/PUBLICATIONS.md
@ -0,0 +1,8 @@
+# Publications Using Cutlass
+
+## 2020
+
+- ["Scalable Knowledge Graph Analytics at 136 Petaflop/s"](https://www.computer.org/csdl/proceedings-article/sc/2020/999800a061/1oeORDgCM0g). Ramakrishnan Kannan, Piyush Sao, Hao Lu, Drahomira Herrmannova, Vijay Thakkar,  Robert Patton, Richard Vuduc, Thomas Potok. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2020.
+
+- ["Accelerating Sparse DNN Models without Hardware-Support via Tile-Wise Sparsity
+"](https://arxiv.org/abs/2008.13006). Cong Guo, Bo Yang Hsueh, Jingwen Leng, Yuxian Qiu, Yue Guan, Zehuan Wang, Xiaoying Jia, Xipeng Li, Minyi Guo, Yuhao Zhu. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2020.
--- a/README.md
+++ b/README.md
@ -1,8 +1,8 @@
 ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

-# CUTLASS 2.3
+# CUTLASS 2.5

-_CUTLASS 2.3 - September 2020_
+_CUTLASS 2.5 - February 2021_

 CUTLASS is a collection of CUDA C++ template abstractions for implementing
 high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@ -25,18 +25,34 @@ Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations
 targeting the  programmable, high-throughput _Tensor Cores_ implemented by 
 NVIDIA's Volta, Turing, and Ampere architectures.

+Additionaly, CUTLASS implements high-performance convolution (implicit GEMM). 
+Implicit GEMM is the formulation of a convolution operation as a GEMM. This allows CUTLASS 
+to build convolutions by reusing highly optimized warp-wide GEMM components and below. 
+
 See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.

-See the [functionality listing](media/docs/functionality.md) for the list of operations
+See the [functionality listing](/media/docs/functionality.md) for the list of operations
 supported at each level of the execution model hierarchy.

+# What's New in CUTLASS 2.5
+CUTLASS 2.5 is a minor update to CUTLASS adding:
+- [Tensor reductions](/test/unit/reduction/device/tensor_reduce_contiguous.cu)
+- [Optimizations for 3-D convolution](include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h)
+- [Fused Convolution+Convolution example](/examples/13_two_tensor_op_fusion/README.md)
+- See the [CHANGELOG](CHANGELOG.md) for more details
+
+# What's New in CUTLASS 2.4
+CUTLASS 2.4 is a significant update to CUTLASS adding:
+- 1-D, 2-D, and 3-D convolution targeting Tensor and CUDA cores for NVIDIA Ampere, Turing, and Volta GPU architectures
+- CUTLASS profiler support for convolution
+- [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation
+
 # What's New in CUTLASS 2.3

 CUTLASS 2.3 is a minor update to CUTLASS adding:
 - GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs
 - Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores
 - Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- See the [CHANGELOG](CHANGELOG.md) for more details.

 # What's New in CUTLASS 2.2

@ -118,6 +134,7 @@ CUTLASS is described in the following documents and the accompanying
 - [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS
 - [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA
 - [GEMM API](media/docs/gemm_api.md) - describes the CUTLASS GEMM model and C++ template concepts 
+- [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS
 - [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project
 - [Terminology](media/docs/terminology.md) - describes terms used in the code
 - [Programming Guidelines](media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++
@ -140,7 +157,7 @@ CUTLASS unit tests, examples, and utilities can be build with CMake starting ver
 Make sure the `CUDACXX` environment  variable points to NVCC in the CUDA Toolkit installed
 on your system.

-```
+```bash
 $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
 ```

@ -149,7 +166,7 @@ for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce
 the architectures to build CUTLASS for by changing the CMake configuration setting
 `CUTLASS_NVCC_ARCHS`.

-```
+```bash
 $ mkdir build && cd build

 $ cmake .. -DCUTLASS_NVCC_ARCHS=80               # compiles for NVIDIA's Ampere Architecture
@ -160,7 +177,7 @@ From the `build/` directory, compile and run the CUTLASS unit tests by building
 The unit tests are organized as several binaries mirroring the top-level namespaces of CUTLASS,
 and they may be executed in parallel via make's `-j` command line argument.

-```
+```bash
 $ make test_unit -j
 ...
 ...
@ -191,6 +208,8 @@ include/                     # client applications should target this directory

    arch/                    # direct exposure of architecture features (including instruction-level GEMMs)

+    conv/                    # code specialized for convolution
+
    gemm/                    # code specialized for general matrix product computations

    layout/                  # layout definitions for matrices, tensors, and other mathematical objects in memory
@ -210,34 +229,39 @@ include/                     # client applications should target this directory

 ```
 examples/
-  00_basic_gemm/             # launches a basic GEMM with single precision inputs and outputs
+  00_basic_gemm/                   # launches a basic GEMM with single precision inputs and outputs

-  01_cutlass_utilities/      # demonstrates CUTLASS Utilities for allocating and initializing tensors
+  01_cutlass_utilities/            # demonstrates CUTLASS Utilities for allocating and initializing tensors
  
-  02_dump_reg_smem/          # debugging utilities for printing register and shared memory contents
+  02_dump_reg_smem/                # debugging utilities for printing register and shared memory contents
  
-  03_visualize_layout/       # utility for visualizing all layout functions in CUTLASS
+  03_visualize_layout/             # utility for visualizing all layout functions in CUTLASS

-  04_tile_iterator/          # example demonstrating an iterator over tiles in memory
+  04_tile_iterator/                # example demonstrating an iterator over tiles in memory

-  05_batched_gemm/           # example demonstrating CUTLASS's batched strided GEMM operation
+  05_batched_gemm/                 # example demonstrating CUTLASS's batched strided GEMM operation

-  06_splitK_gemm/            # exmaple demonstrating CUTLASS's Split-K parallel reduction kernel
+  06_splitK_gemm/                  # exmaple demonstrating CUTLASS's Split-K parallel reduction kernel

-  07_volta_tensorop_gemm/    # example demonstrating mixed precision GEMM using Volta Tensor Cores
+  07_volta_tensorop_gemm/          # example demonstrating mixed precision GEMM using Volta Tensor Cores

-  08_turing_tensorop_gemm/   # example demonstrating integer GEMM using Turing Tensor Cores
+  08_turing_tensorop_gemm/         # example demonstrating integer GEMM using Turing Tensor Cores

-  10_planar_complex/         # example demonstrating planar complex GEMM kernels
+  09_turing_tensorop_conv2dfprop/  # example demonstrating integer implicit GEMM convolution (forward propagation) using Turing Tensor Cores

-  11_planar_complex_array/   # example demonstrating planar complex kernels with batch-specific problem sizes
+  10_planar_complex/               # example demonstrating planar complex GEMM kernels

-  12_gemm_bias_relu/         # example demonstrating GEMM fused with bias and relu
+  11_planar_complex_array/         # example demonstrating planar complex kernels with batch-specific problem sizes

-  13_fused_two_gemms/        # example demonstrating two GEMms fused in one kernel
+  12_gemm_bias_relu/               # example demonstrating GEMM fused with bias and relu
+
+  13_fused_two_gemms/              # example demonstrating two GEMms fused in one kernel
+
+  22_ampere_tensorop_conv2dfprop/  # example demonstrating integer implicit GEMM convolution (forward propagation) using Ampere Tensor Cores
 ```

 ### Tools
+
 ```
 tools/
  library/                   # CUTLASS Instance Library - contains instantiations of all supported CUTLASS templates
@ -266,30 +290,85 @@ Instructions for building and running the Unit tests are described in the [Quick
 The `tools/profiler/` directory contains a command-line utility for launching each of the GEMM kernels.
 It can be built as follows:

-```
+```bash
 $ make cutlass_profiler -j16
 ```
+## Building all GEMM and Convolution kernels (_long_ build times)

 By default, only one tile size is instantiated for each data type, math instruction, and layout.
 To instantiate all, set the following environment variable when running CMake from an empty `build/` directory.
 Beware, this results in *thousands* of kernels and long build times.
-```
+```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all
 ...
 $ make cutlass_profiler -j16
 ```

-To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with 
-wildcard characters may be reduce the set of kernels. The following builds exactly one kernel:
+## Building a subset of GEMM and Convolution kernels (_reduced_ build times)

-```
-$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
+To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with 
+wildcard characters may be used to reduce the set of kernels. The following examples show building exactly one
+or a subset of kernels for NVIDIA Ampere and Turing architecture:
+
+### Building a subset Tensor Core GEMM kernels
+
+To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere and Turing architecture, 
+use the below cmake command line:
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8
 ...
 $ make cutlass_profiler -j16
 ```

-Example command line for profiling SGEMM kernels is as follows:
+Example command line for profiling a subset of Tensor Core GEMM kernels is as follows:
+```bash
+./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*gemm_f16_*_nt_align8 --m=3456 --n=4096 --k=4096
+
+...
+=============================
+  Problem ID: 1
+
+        Provider: CUTLASS
+   OperationKind: gemm
+       Operation: cutlass_tensorop_s1688gemm_f16_256x128_32x2_nt_align8
+
+          Status: Success
+    Verification: ON
+     Disposition: Passed
+
+reference_device: Passed
+          cuBLAS: Passed
+
+       Arguments: --gemm_kind=universal --m=3456 --n=4096 --k=4096 --A=f16:column --B=f16:row --C=f32:column --alpha=1  \
+                  --beta=0 --split_k_slices=1 --batch_count=1 --op_class=tensorop --accum=f32 --cta_m=256 --cta_n=128  \
+                  --cta_k=32 --stages=2 --warps_m=4 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=8 --min_cc=75  \
+                  --max_cc=1024
+
+           Bytes: 118489088  bytes
+           FLOPs: 115992428544  flops
+
+         Runtime: 1.55948  ms
+          Memory: 70.7616 GiB/s
+
+            Math: 74378.8 GFLOP/s
+
+
+
+=============================
+...
 ```
+
+### Building one CUDA Core GEMM kernel
+
+To compile one SGEMM kernel targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
+...
+$ make cutlass_profiler -j16
+```
+
+Example command line for profiling single SGEMM CUDA kernel is as follows:
+```bash
 $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096

 =============================
@ -316,9 +395,111 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
          Memory: 24.934 GiB/s

            Math: 17218.4 GFLOP/s
+
+=============================
 ```

-[Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)
+### Building a subset of Tensor Core Convolution kernels
+
+To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation 
+and FP16 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16
+...
+$ make cutlass_profiler -j16
+```
+
+Example command line for profiling a subset of Tensor Core convolution kernels is as follows:
+
+```bash
+$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*fprop_optimized_f16 --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
+
+...
+=============================
+  Problem ID: 1
+
+        Provider: CUTLASS
+   OperationKind: conv2d
+       Operation: cutlass_tensorop_s16816fprop_optimized_f16_128x128_32x5_nhwc
+
+          Status: Success
+    Verification: ON
+     Disposition: Passed
+
+reference_device: Passed
+
+       Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1  \
+                  --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f16:nhwc --Filter=f16:nhwc --Output=f32:nhwc  \
+                  --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1  \
+                  --eq_gemm_provider=none --op_class=tensorop --accum=f32 --cta_m=128 --cta_n=128 --cta_k=32 --stages=5  \
+                  --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024
+
+           Bytes: 1130659840  bytes
+           FLOPs: 118482796544  flops
+
+         Runtime: 0.711496  ms
+          Memory: 1479.99 GiB/s
+
+            Math: 166526 GFLOP/s
+
+=============================
+...
+```
+
+
+### Building one Convolution CUDA kernel
+
+To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation 
+and FP32 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line:
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
+...
+$ make cutlass_profiler -j16
+```
+
+Example command line for profiling one CUDA Core convolution kernel:
+
+```bash
+$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
+
+
+=============================
+  Problem ID: 1
+
+        Provider: CUTLASS
+   OperationKind: conv2d
+       Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
+
+          Status: Success
+    Verification: ON
+     Disposition: Passed
+
+reference_device: Passed
+
+       Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1  \
+                  --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc  \
+                  --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1  \
+                  --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4  \
+                  --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024
+
+           Bytes: 2055798784  bytes
+           FLOPs: 118482796544  flops
+
+         Runtime: 7.34266  ms
+          Memory: 260.752 GiB/s
+
+            Math: 16136.2 GFLOP/s
+
+
+=============================
+
+```
+
+## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler
+- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels:
+  - [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples) 
+  - [Implicit GEMM conovlution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
+- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)


 # About
@ -332,7 +513,7 @@ The official list of CUTLASS developers and contributors is available here: [CON

 # Copyright

-Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.

 ```
  Redistribution and use in source and binary forms, with or without modification, are permitted
--- a/cmake/CTestTestfile.config.cmake
+++ b/cmake/CTestTestfile.config.cmake
@ -0,0 +1,19 @@
+# Generated file
+
+if (DEFINED ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT})
+  set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT})
+else()
+  set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT @CUTLASS_TEST_EXECUTION_ENVIRONMENT@)
+endif()
+
+if (NOT "@TEST_EXE_DIR@" STREQUAL "")
+  set(TEST_EXE_PATH @TEST_EXE_DIR@/@TEST_EXE@)
+else()
+  set(TEST_EXE_PATH @TEST_EXE@)
+endif()
+
+add_test("@TEST_NAME@" ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@)
+
+if (NOT "@TEST_EXE_WORKING_DIRECTORY@" STREQUAL "")
+  set_tests_properties("@TEST_NAME@" PROPERTIES WORKING_DIRECTORY "@TEST_EXE_WORKING_DIRECTORY@")
+endif()
--- a/cmake/nop.cu
+++ b/cmake/nop.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/cuBLAS.cmake
+++ b/cuBLAS.cmake
@ -1,3 +1,24 @@
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright notice, this list of
+#       conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright notice, this list of
+#       conditions and the following disclaimer in the documentation and/or other materials
+#       provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+#       to endorse or promote products derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 message(STATUS "Configuring cublas ...")

--- a/cuDNN.cmake
+++ b/cuDNN.cmake
@ -0,0 +1,107 @@
+
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright notice, this list of
+#       conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright notice, this list of
+#       conditions and the following disclaimer in the documentation and/or other materials
+#       provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+#       to endorse or promote products derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+if(DEFINED CUDNN_ENABLED)
+    set(CUTLASS_ENABLE_CUDNN ${CUDNN_ENABLED} CACHE BOOL "Enable CUTLASS to build with cuDNN library.")
+endif()
+
+if(DEFINED CUTLASS_ENABLE_CUDNN AND NOT CUTLASS_ENABLE_CUDNN)
+  return()
+endif()
+  
+message(STATUS "Configuring cuDNN ...")
+
+find_path(
+    _CUDNN_INCLUDE_DIR cudnn.h
+    PATHS
+    ${CUDA_TOOLKIT_ROOT_DIR}/include
+    $ENV{CUDNN_PATH}/include
+    $ENV{CUDA_PATH}/include
+    ${CUDNN_PATH}/include
+    /usr/include)
+
+find_library(
+    _CUDNN_LIBRARY cudnn
+    HINTS
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib
+    $ENV{CUDNN_PATH}/lib64
+    $ENV{CUDNN_PATH}/lib/x64
+    $ENV{CUDNN_PATH}/lib
+    $ENV{CUDA_PATH}/lib64
+    $ENV{CUDA_PATH}/lib/x64
+    $ENV{CUDA_PATH}/lib
+    ${CUDNN_PATH}/lib64
+    ${CUDNN_PATH}/lib/x64
+    ${CUDNN_PATH}/lib
+    /usr/lib/x86_64-linux-gnu
+    /usr/lib)
+
+if(_CUDNN_INCLUDE_DIR AND _CUDNN_LIBRARY)
+
+    message(STATUS "cuDNN: ${_CUDNN_LIBRARY}")
+    message(STATUS "cuDNN: ${_CUDNN_INCLUDE_DIR}")
+    
+    set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found")
+
+else()
+
+    message(STATUS "cuDNN not found.")
+    set(CUDNN_FOUND OFF CACHE INTERNAL "cuDNN Library Found")
+
+endif()
+
+set(CUTLASS_ENABLE_CUDNN ${CUDNN_FOUND} CACHE BOOL "Enable CUTLASS to build with cuDNN library.")
+
+if (CUTLASS_ENABLE_CUDNN AND NOT TARGET cudnn)
+
+  set(CUDNN_INCLUDE_DIR ${_CUDNN_INCLUDE_DIR})
+  set(CUDNN_LIBRARY ${_CUDNN_LIBRARY})
+
+  if(WIN32)
+    add_library(cudnn STATIC IMPORTED GLOBAL)
+  else()
+    add_library(cudnn SHARED IMPORTED GLOBAL)
+  endif()
+
+  add_library(nvidia::cudnn ALIAS cudnn)
+
+  set_property(
+    TARGET cudnn
+    PROPERTY IMPORTED_LOCATION
+    ${CUDNN_LIBRARY})
+    
+  target_include_directories(
+    cudnn
+    INTERFACE
+    $<INSTALL_INTERFACE:include>
+    $<BUILD_INTERFACE:${CUDNN_INCLUDE_DIR}>)
+
+endif()
+
+if(CUTLASS_ENABLE_CUDNN AND NOT CUDNN_FOUND)
+  message(FATAL_ERROR "CUTLASS_ENABLE_CUDNN enabled but cuDNN library could not be found.")
+endif()
+
+message(STATUS "Configuring cuDNN ... done.")
--- a/examples/00_basic_gemm/CMakeLists.txt
+++ b/examples/00_basic_gemm/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/00_basic_gemm/basic_gemm.cu
+++ b/examples/00_basic_gemm/basic_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/01_cutlass_utilities/CMakeLists.txt
+++ b/examples/01_cutlass_utilities/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/01_cutlass_utilities/cutlass_utilities.cu
+++ b/examples/01_cutlass_utilities/cutlass_utilities.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/02_dump_reg_shmem/CMakeLists.txt
+++ b/examples/02_dump_reg_shmem/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu
+++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 *modification, are permitted provided that the following conditions are met:
--- a/examples/03_visualize_layout/CMakeLists.txt
+++ b/examples/03_visualize_layout/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
@ -20,9 +20,15 @@
 # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+set(TEST_COMMAND_00 RowMajor --extent=16,16)
+set(TEST_COMMAND_01 "ColumnMajorInterleaved<4>" --extent=32,8 --output-shape=16 --vectorize=4)
+
 cutlass_example_add_executable(
  03_visualize_layout
  visualize_layout.cpp
  register_layout.cu
+  TEST_COMMAND_OPTIONS
+  TEST_COMMAND_00
+  TEST_COMMAND_01
  )

--- a/examples/03_visualize_layout/options.h
+++ b/examples/03_visualize_layout/options.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/03_visualize_layout/register_layout.cu
+++ b/examples/03_visualize_layout/register_layout.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/03_visualize_layout/register_layout.h
+++ b/examples/03_visualize_layout/register_layout.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/03_visualize_layout/visualize_layout.cpp
+++ b/examples/03_visualize_layout/visualize_layout.cpp
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -32,6 +32,8 @@
 #include <iomanip>
 #include <memory>

+#include <cutlass/cutlass.h>
+
 #include "options.h"
 #include "register_layout.h"

@ -133,6 +135,8 @@ int main(int argc, char const *arg[]) {

  layout_it->second->print_csv(std::cout);

+  cudaFree(0); // Ensure CUDA is available.
+
  return 0;
 }

--- a/examples/03_visualize_layout/visualize_layout.h
+++ b/examples/03_visualize_layout/visualize_layout.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/04_tile_iterator/CMakeLists.txt
+++ b/examples/04_tile_iterator/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/04_tile_iterator/tile_iterator.cu
+++ b/examples/04_tile_iterator/tile_iterator.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/05_batched_gemm/CMakeLists.txt
+++ b/examples/05_batched_gemm/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/05_batched_gemm/batched_gemm.cu
+++ b/examples/05_batched_gemm/batched_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/06_splitK_gemm/CMakeLists.txt
+++ b/examples/06_splitK_gemm/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/06_splitK_gemm/splitk_gemm.cu
+++ b/examples/06_splitK_gemm/splitk_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/07_volta_tensorop_gemm/CMakeLists.txt
+++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
+++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -284,8 +284,12 @@ int run() {
  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm_op;

+  // Check the problem size is supported or not 
+  cutlass::Status status = gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
  // Initialize CUTLASS kernel with arguments and workspace pointer
-  cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
+  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  // Launch initialized CUTLASS kernel
--- a/examples/08_turing_tensorop_gemm/CMakeLists.txt
+++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
+++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -188,31 +188,6 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,

 int run() {

-  // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available
-  // in CUDA 10.2. 
-  //
-  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
-  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
-    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
-    return -1;
-  }
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!((props.major * 10 + props.minor) >= 75)) {
-    std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75."
-              << std::endl;
-
-    // Return 0 so tests are considered passing if run on unsupported platforms.
-    return 0;
-  }
-
  const int length_m = 5120;
  const int length_n = 4096;
  const int length_k = 4096;
@ -291,8 +266,12 @@ int run() {
  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm_op;

+  // Check the problem size is supported or not 
+  cutlass::Status status = gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
  // Initialize CUTLASS kernel with arguments and workspace pointer
-  cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
+  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  // Launch initialized CUTLASS kernel
@ -337,18 +316,37 @@ int run() {
 }

 int main() {
+  bool notSupported = false;
+
  // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available
  // in CUDA 10.2. 
  //
  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+    notSupported = true;
+  }

-    // Returning zero so this test passes when built on older Toolkits. 
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!((props.major * 10 + props.minor) >= 75)) {
+    std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75."
+              << std::endl;
+
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
-  else {
-    return run();
-  }
+
+  return run();
 }

--- a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
+++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
@ -20,14 +20,9 @@
 # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+
 cutlass_example_add_executable(
-  13_fused_two_gemms
-  fused_gemm.cu
-  )
-
-target_include_directories(
-  13_fused_two_gemms
-  PRIVATE
-  .
+  09_turing_tensorop_conv2dfprop
+  turing_tensorop_conv2dfprop.cu
  )

--- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
+++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
@ -0,0 +1,764 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+
+
+This example shows how to run convolution kernels using functions and data structures
+provided by CUTLASS using tensor cores; which we run on a NVIDIA Turing GPU.
+
+Writing a single high performance convolution kernel is hard but do-able. Whereas writing
+high performance kernels at scale which works for multiple problem sizes with good abstractions is
+really hard. CUTLASS solves this problem by providing simplified abstractions to compose
+multiple sections of implicit gemm kernel. When used properly, the kernels can hit peak performance
+of GPU easily.
+
+CUTLASS divides a kernel into hierarchical composable sections. Which means, at each thread, warp
+and thread-block level, they compute on their own tile-size with higher level of tile sizes being
+composed from lower level ones. Multiple thread-tiles (tile size each thread computes) can be used
+to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
+threadblock-tile (tile size computed by a threadblock).
+
+In thie example, we split variable initialization into
+1. Setting up data properties : describes how tensors are laid out in the memory and how the kernel
+can view them (logical to physical mapping)
+2. Setting up computation properties : describes how the above set tensors will be used to compute
+output of convolution.
+
+First, we setup the data types of the input tensor A, weights' tensor B and output tensor C along
+with alpha, beta as the equation for convolution is C = alpha * Conv(A, B) + beta * C. In CUTLASS,
+the kernels first compute Conv(A, B) and leave the rest of the computation to end of the kernel as
+alpha * X + beta * C is a simple element-wise operation on X (Conv(A, B)) and C. We call this as 
+epilogue of kernel. Hence, we setup data types for alpha and beta to be equal to 
+ElementComputeEpilogue = float. We want to use MMA instructions on Turing and they support 4-bit
+signed integer. But int4b_t is not fully supported by Nvidia software stack, so CUTLASS introduces
+cutlass::int4b_t. We use the data type for elements in input tensor A and B as cutlass::int4b_t. We
+convey this to CUTLASS kernel by initializing template variables ElementAccumulator (int32_t),
+ElementComputeEpilogue (float), ElementInputA (cutlass::int4b_t), ElementInputB (cutlass::int4b_t),
+ElementOutput (int32_t). Communicating just the data type is not enough. As the data is laid out 
+linearly in memory, we have to convey the layout of tensors. We do that by initializing template
+variables LayoutInputA, LayoutInputB and LayoutOutput to TensorNHWC cutlass variable. Next, we setup
+rules to comptue alpha * X + beta * C which is called epilogue of the kernel. We initialize template
+variable EpilogueOp, which takes the data type of output ElementOutput (int32_t), the number of
+elements per vector memory access (32), data type of accumulator (int32_t) and data type of
+computation of linear combination (alpha * X + beta * C).
+
+Now that we setup the properties of data, we have to setup properties of computation.
+
+Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128,
+64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it
+internally deduces the amount of threads needed per thread-block, amount of shared memory, storing
+data in bank-conflict free manner, and ton of other variables required to compose, intialize and
+launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer
+from understanding and coding complicated hardware optimizations which can easily go wrong.
+
+CUTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? MMA pipelines
+constitute the whole process of loading input data from global memory to shared memory, loading data
+from shared memory to registers, doing matrix multiplication, store to global memory. The below flow
+sequence shows a typical mma pipeline.
+
+tensor in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
+output to global memory
+
+The problem with single pipeline is, each stage is synchronous which means, each stage has to wait
+until the previous finished executing. There are stages in the pipeline which do not have fixed
+latency, for example, the loads from global memory and shared memory. Therefore, we can add one more
+pipeline with a phase shift in mma kernel to hide latency from global and shared memory loads.
+Finally, the pipeline in a kernel looks like
+
+(1) tensor in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
+mma -> (6) registers -> (7) output to global memory (1) <null> -> (2) <null> -> (3) tensor in global
+memory -> (4) registers -> (5) tile in shared memory -> (6) registers -> (7) mma -> (8) registers ->
+(9) output to global memory
+
+This way, you can hide the second global memory load latency by doing computation on already loaded
+input data.
+
+There are few more template variables initialized such as, which threadblock tile of output matrix
+is done which threadblock launched on an SM, CUDA SM architecture of GPU you want to run on.
+
+These are all put together to create a template variable which describes CUTLASS Implicit GEMM
+kernel using cutlass::conv::device::ImplicitGemm template.
+
+The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
+We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come
+in the way of learning CUTLASS.
+
+Once all the tensors are initialized and filled with data, create arguments tuple to launch CUTLASS
+kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K = 64,
+R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
+important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space
+memory required by the kernel we instantiated. If yes, we create it and pass it along with other
+arguments created to intialize CUTLASS kernel then, the kernel is launched.
+
+In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to
+compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel.
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/convolution.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "helper.h"
+
+// The code section below describes datatype for input, output tensors and computation between
+// elements
+using ElementAccumulator = int32_t;                 // Data type of accumulator
+using ElementComputeEpilogue = float;               // Data type of epilogue computation (alpha, beta)
+using ElementInputA = cutlass::int4b_t;             // Data type of elements in input tensor
+using ElementInputB = cutlass::int4b_t;             // Data type of elements in input tensor
+using ElementOutput = cutlass::int4b_t;             // Data type of elements in output tensor
+
+using LayoutInputA = cutlass::layout::TensorNHWC;
+using LayoutInputB = cutlass::layout::TensorNHWC;
+using LayoutOutput = cutlass::layout::TensorNHWC;
+
+// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
+using MMAOp = cutlass::arch::OpClassTensorOp;
+
+// This code section describes CUDA SM architecture number
+using SmArch = cutlass::arch::Sm75;
+
+// This code section describes the tile size a thread block will compute
+using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 128>;  // Threadblock tile shape
+
+// This code section describes tile size a warp will compute
+using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;         // Warp tile shape
+
+// This code section describes the size of MMA op
+using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;    // TensorCore instruction shape
+
+// This code section describes how threadblocks are scheduled on GPU
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
+
+// Number of pipelines you want to use
+constexpr int NumStages = 2;
+
+// This code section describes the epilogue part of the kernel, we use default value
+using EpilogueOp = cutlass::epilogue::thread::LinearCombinationClamp<
+    ElementOutput,                                     // Data type of output matrix.
+    8,                                                 // The number of elements per vectorized.
+                                                       // memory access. This becomes the vector width of
+                                                       // math instructions in the epilogue too.
+    ElementAccumulator,                                // Data type of accumulator
+    ElementComputeEpilogue>;                           // Data type for alpha/beta in linear combination
+
+
+using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+  ElementInputA, LayoutInputA,
+  ElementInputB, LayoutInputB,
+  ElementOutput, LayoutOutput,
+  ElementAccumulator,
+  MMAOp,
+  SmArch,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOp,
+  SwizzleThreadBlock,
+  NumStages,
+  cutlass::arch::OpMultiplyAddSaturate,
+  cutlass::conv::IteratorAlgorithm::kAnalytic
+>::Kernel;
+
+using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  cutlass::Tensor4DCoord input_size;
+  cutlass::Tensor4DCoord filter_size;
+  cutlass::Tensor4DCoord padding;
+  cutlass::MatrixCoord conv_stride;
+  cutlass::MatrixCoord dilation;
+  bool reference_check;
+  bool measure_performance;
+  int iterations;
+  bool save_workspace;
+  ElementComputeEpilogue alpha;
+  ElementComputeEpilogue beta;
+  bool benchmark;
+  std::string tag;
+
+  Options():
+    help(false),
+    input_size(1, 32, 32, 32),
+    filter_size(32, 3, 3, 32),
+    padding(1, 1, 1, 1),
+    conv_stride(1, 1),
+    dilation(1, 1),
+    reference_check(false),
+    measure_performance(true),
+    iterations(20),
+    save_workspace(false),
+    alpha(1),
+    beta(0),
+    benchmark(false) { }
+
+  // Verify the problem size is compatible with the CUTLASS Convolution implementation.
+  bool valid() {
+
+    //
+    // CUTLASS attempts to load 128b vectors of int4b_t elements. Consequently,
+    // all pointers, strides, and tensor extents must be divisible by 32 elements.
+    //
+    int const kAlignment = 32;
+
+    if ((input_size.c() % kAlignment) ||
+      (filter_size.n() % kAlignment)) {
+
+      // misaligned tensors
+      return false;
+    }
+
+    // Invalid padding
+    if ((padding.h() != filter_size.h() / 2) ||
+      (padding.w() != filter_size.w() / 2)) {
+
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Updates input and filter sizes
+  void update(
+    cutlass::Tensor4DCoord input_size,
+    cutlass::Tensor4DCoord filter_size) {
+
+    this->input_size = input_size;
+    this->filter_size = filter_size;
+
+    padding.n() = filter_size.h() / 2;
+    padding.h() = filter_size.h() / 2;
+    padding.w() = filter_size.w() / 2;
+    padding.c() = filter_size.w() / 2;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    if (cmd.check_cmd_line_flag("ref-check")) {
+      reference_check = true;
+    }
+
+    if (cmd.check_cmd_line_flag("perf-check")) {
+      measure_performance = true;
+    }
+
+    if (cmd.check_cmd_line_flag("save-workspace")) {
+      save_workspace = true;
+    }
+
+    if (cmd.check_cmd_line_flag("benchmark")) {
+      benchmark = true;
+    }
+
+    cmd.get_cmd_line_argument("n", input_size.n());
+    cmd.get_cmd_line_argument("h", input_size.h());
+    cmd.get_cmd_line_argument("w", input_size.w());
+    cmd.get_cmd_line_argument("c", input_size.c());
+
+    cmd.get_cmd_line_argument("k", filter_size.n());
+    cmd.get_cmd_line_argument("r", filter_size.h());
+    cmd.get_cmd_line_argument("s", filter_size.w());
+    filter_size.c() = input_size.c(); 
+
+    cmd.get_cmd_line_argument("alpha", alpha);
+    cmd.get_cmd_line_argument("beta", beta);
+    
+    cmd.get_cmd_line_argument("iterations", iterations);
+    cmd.get_cmd_line_argument("tag", tag);
+
+    if (filter_size.h() == 3 && filter_size.w() == 3) {
+      padding = {1, 1, 1, 1};
+    }
+    else {
+      filter_size.h() = 1;
+      filter_size.w() = 1;
+      padding = {0, 0, 0, 0};
+    }
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "09_turing_tensorop_conv2dfprop example\n\n"
+      << "  This example uses Turing's Tensor Core operators on int4 data types to compute\n"
+      << "  forward convolution on tensors of layout NHWC.\n\n"
+      << "Options:\n\n"
+      << "  --help               If specified, displays this usage statement.\n\n"
+      << "  --n <int>            Input tensor extent N\n"
+      << "  --h <int>            Input tensor extent H\n"
+      << "  --w <int>            Input tensor extent W\n"
+      << "  --c <int>            Input tensor extent C\n"
+      << "  --k <int>            Filter extent K\n"
+      << "  --r <int>            Filter extent R\n"
+      << "  --s <int>            Filter extent S\n\n"
+      << "  --alpha <float>      Epilogue scalar alpha\n"
+      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --ref-check          If set (true), reference check on the host is computed\n"
+      << "  --perf-check         If set (true), performance is measured.\n"
+      << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
+      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --save-workspace     If set, workspace is written to a text file.\n"
+      << "  --tag <string>       String to replicate across the first column in the results table\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop  --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
+      << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop  --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";
+
+    return out;
+  }
+  
+  /// Computes the output tensor size (NPQK)
+  cutlass::Tensor4DCoord output_size() const {
+    return cutlass::Tensor4DCoord(
+      input_size.n(),
+      (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
+      (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
+      filter_size.n());
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+
+    // Number of multiply-adds = NPQK * CRS
+    int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c());
+    
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Result {
+  double runtime_ms;
+  double gflops;
+  cutlass::Status status;
+  cutlass::Status reference_check;
+  cudaError_t error;
+
+  Result(): 
+    runtime_ms(0), 
+    gflops(0),
+    status(cutlass::Status::kSuccess),
+    reference_check(cutlass::Status::kInvalid),
+    error(cudaSuccess) { }
+
+  static std::ostream & print_header(std::ostream &out, Options const &options) {
+
+    if (!options.tag.empty()) {
+      out << "Name,";
+    }
+
+    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
+
+    return out;
+  }
+
+  std::ostream & print(std::ostream &out, int idx, Options const &options) {
+
+    if (!options.tag.empty()) {
+      out << options.tag << ",";
+    }
+
+    out 
+      << "conv_" << idx << ","
+      << options.input_size.n() << ","
+      << options.input_size.h() << ","
+      << options.input_size.w() << ","
+      << options.input_size.c() << ","
+      << options.filter_size.n() << ","
+      << options.filter_size.h() << ","
+      << options.filter_size.w() << ","
+      << runtime_ms << ","
+      << gflops;
+
+    return out;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Runs one benchmark
+Result profile_convolution(Options const &options) {
+
+  Result result;
+
+  //
+  // Allocate host-device tensors using the CUTLASS Utilities.
+  //
+
+  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
+  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());
+
+  //
+  // Initialize tensors
+  //
+
+  // Fill tensor A on host with uniform-distribution random data
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_a.host_view(),
+      1,
+      ElementInputA(7),
+      ElementInputA(-8),
+      0);
+
+  // Fill tensor B on host with uniform-distribution random data
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_b.host_view(),
+      1,
+      ElementInputB(7),
+      ElementInputB(-8),
+      0);
+
+  // Fill tensor C on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_c.host_view());
+
+  // Fill tensor C for reference on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_ref_c.host_view());
+
+  // Copy data from host to GPU
+  tensor_a.sync_device();
+  tensor_b.sync_device();
+  tensor_c.sync_device();
+  tensor_ref_c.sync_device();
+
+  //
+  // Define arguments for CUTLASS Convolution
+  //
+
+  // mode (kCrossCorrelation or kConvolution)
+  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+
+  // Split K dimension into 1 partitions
+  int split_k_slices = 1;
+
+  // Construct Conv2dProblemSize with user defined output size
+  cutlass::conv::Conv2dProblemSize problem_size(      
+      options.input_size,
+      options.filter_size,
+      options.padding,
+      options.conv_stride,
+      options.dilation,
+      options.output_size(),
+      mode,
+      split_k_slices);
+
+  // Construct ImplicitGemm::Argument structure with conv2d 
+  // problem size, data pointers, and epilogue values
+  typename ImplicitGemm::Arguments arguments{
+    problem_size,
+    tensor_a.device_ref(),
+    tensor_b.device_ref(),
+    tensor_c.device_ref(),
+    tensor_c.device_ref(),
+    {options.alpha, options.beta},
+  };
+
+  //
+  // Initialize CUTLASS Convolution
+  //
+
+  ImplicitGemm implicit_gemm_op;
+
+  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  result.status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(result.status);
+
+  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
+  CUTLASS_CHECK(result.status);
+
+  //
+  // Launch initialized CUTLASS kernel
+  //
+  result.status = implicit_gemm_op();
+
+  CUTLASS_CHECK(result.status);
+
+  //
+  // Optional reference check
+  //
+  
+  if (options.reference_check) {
+    std::cout << "Verification on host...\n";
+
+    // Compute with reference implementation
+    cutlass::reference::host::Conv2dFprop<
+      ElementInputA,
+      LayoutInputA,
+      ElementInputB,
+      LayoutInputB,
+      ElementOutput,
+      LayoutOutput,
+      ElementComputeEpilogue,
+      ElementAccumulator,
+      cutlass::NumericConverterClamp<ElementOutput, ElementComputeEpilogue>
+    >(
+      problem_size,
+      tensor_a.host_ref(),
+      tensor_b.host_ref(),
+      tensor_c.host_ref(),
+      tensor_ref_c.host_ref(),
+      options.alpha,
+      options.beta
+    );
+
+    // Check if output from CUTLASS kernel and reference kernel are equal or not
+    tensor_c.sync_host();
+
+    bool passed = cutlass::reference::host::TensorEquals(
+      tensor_c.host_view(),
+      tensor_ref_c.host_view());
+
+    if (!passed) {
+      result.reference_check = cutlass::Status::kErrorInternal;
+      std::cout << "ERROR - results miscompared.\n";
+    }
+    else {
+      result.reference_check = cutlass::Status::kSuccess;
+      std::cout << "Passed.\n";
+    }
+  }
+  else {
+    result.reference_check = cutlass::Status::kInvalid;
+  }
+
+  if (options.save_workspace) {
+
+    std::stringstream ss;
+
+    ss << "09_tensor_conv_workspace_conv2dfprop_"
+      << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() 
+      << "_"
+      << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() 
+      << ".dat";
+
+    std::ofstream output_workspace(ss.str());
+
+    output_workspace 
+      << "Input = \n" << tensor_a.host_view() << "\n\n"
+      << "Filters = \n" << tensor_b.host_view() << "\n\n";
+
+    if (options.reference_check) {
+      output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
+    }
+
+    output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
+
+    std::cout << "Results written to '" << ss.str() << "'." << std::endl;
+  }
+  
+  //
+  // Performance measurement
+  //
+
+  if (options.measure_performance) {
+
+    cudaEvent_t events[2];
+    
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        return result;
+      }
+    }
+
+    // Record an event at the start of a series of convolution operations.
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Launch a sequence of implicit GEMM operations on the device
+    for (int iteration = 0; iteration < options.iterations; ++iteration) {
+      result.status = implicit_gemm_op();
+      CUTLASS_CHECK(result.status);
+    }
+
+    // Record an event when the convolutions have been launched.
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Print average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    // Cleanup
+    for (auto event : events) {
+      (void)cudaEventDestroy(event);
+    }
+  }
+
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
+  //
+  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
+  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+    return 0;
+  }
+
+  cudaDeviceProp props;
+  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
+
+  if (!(props.major > 7 || (props.major == 7 && props.minor >= 5))) {
+    std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
+              << std::endl;
+    return 0;
+  }
+
+  Options options;
+  
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.benchmark) {
+    // Benchmark several layers
+
+    int batch_sizes[] = {1, 32, 64, 128, 256, 512};
+
+    struct Benchmark {
+      int h, w, c, k, r, s;
+    } layers[] = {
+      {56,  56,   64,   256, 1, 1},
+      {56,  56,   64,    64, 1, 1},
+      {56,  56,   64,    64, 3, 3},
+      {56,  56,  256,    64, 1, 1},
+      {56,  56,  256,   512, 1, 1},
+      {56,  56,  256,   128, 1, 1},
+      {28,  28,  128,   128, 3, 3},
+      {28,  28,  128,   512, 1, 1},
+      {28,  28,  512,   128, 1, 1},
+      {28,  28,  512,  1024, 1, 1},
+      {28,  28,  512,   256, 1, 1},
+      {14,  14,  256,   256, 3, 3},
+      {14,  14,  256,  1024, 1, 1},
+      {14,  14,  1024,  256, 1, 1},
+      {14,  14,  1024, 2048, 1, 1},
+      {14,  14,  1024,  512, 1, 1},
+      {7,    7,   512,  512, 3, 3},
+    };
+
+    Result::print_header(std::cout, options) << std::endl;
+
+    int idx = 1;
+
+    for (auto const &layer : layers) {
+      for (auto N : batch_sizes) {
+
+        options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});
+
+        Result result = profile_convolution(options);
+        result.print(std::cout, idx, options) << std::endl;
+      }
+
+      ++idx;
+    }
+  }
+  else {
+
+    // Execute one problem size
+    if (!options.valid()) {
+      std::cerr << "Invalid problem." << std::endl;
+      return -1;
+    }
+
+    Result result = profile_convolution(options);
+
+    Result::print_header(std::cout, options) << std::endl;
+    result.print(std::cout, 1, options) << std::endl;
+  }
+
+  return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
--- a/examples/10_planar_complex/CMakeLists.txt
+++ b/examples/10_planar_complex/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/10_planar_complex/planar_complex.cu
+++ b/examples/10_planar_complex/planar_complex.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/11_planar_complex_array/CMakeLists.txt
+++ b/examples/11_planar_complex_array/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/11_planar_complex_array/planar_complex_array.cu
+++ b/examples/11_planar_complex_array/planar_complex_array.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/12_gemm_bias_relu/CMakeLists.txt
+++ b/examples/12_gemm_bias_relu/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu
+++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -48,11 +48,19 @@ using ElementInputA = cutlass::half_t;              // <- data type of elements
 using ElementInputB = cutlass::half_t;              // <- data type of elements in input matrix B
 using ElementOutput = float;                        // <- data type of elements in output matrix D

-// The code section below describes matrix layout of input and output matrices. Column Major for
-// Matrix A, Row Major for Matrix B and Row Major for Matrix C
+// The code section below describes matrix layout of input and output matrices.
+// Column Major for Matrix A, B and C.
+//
+// Note this example only works for ColumnMajor output because
+//   1) we only have row major epilogue.
+//   2) we swap A and B if the output is column major then we can still use the
+//      row major epilogue.
+//   3) Mx1 bias vector becomes 1xM after the swapping/transposing.
+//   4) we can use the existing OutputIterator to load 1xM bias vector.
+
 using LayoutInputA = cutlass::layout::ColumnMajor;
 using LayoutInputB = cutlass::layout::ColumnMajor;
-using LayoutOutput = cutlass::layout::RowMajor;
+using LayoutOutput = cutlass::layout::ColumnMajor;

 // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
 using MMAOp = cutlass::arch::OpClassTensorOp;
@ -73,17 +81,18 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw

 // Define the epilogue operation as LinearCombinationRelu. This is approximately equal to
 //
-//    d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij )
+//    d_ij = max(0, alpha * sum_k(a_ik * b_kj) + c_ij )
 //
 using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
-    ElementOutput,                                     // <- data type of output matrix
-    128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- this is the number of elements per
-                                                       // vectorized memory access. For half
-                                                       // precision, it's 8 elements. This becomes
-                                                       // the vector width of math instructions in
-                                                       // epilogue too
-    ElementAccumulator,                                // <- data type of accumulator
-    ElementComputeEpilogue>;  // <- data type for alpha/beta in linear combination function
+    ElementOutput,                                        // <- data type of output matrix
+    128 / cutlass::sizeof_bits<ElementOutput>::value,     // <- this is the number of elements per
+                                                          // vectorized memory access. For half
+                                                          // precision, it's 8 elements. This becomes
+                                                          // the vector width of math instructions in
+                                                          // epilogue too
+    ElementAccumulator,                                   // <- data type of accumulator
+    ElementComputeEpilogue,                               // <- data type for alpha in linear combination function
+    cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // <- alpha x C + bias

 // Number of pipelines you want to use
 constexpr int NumStages = 2;
@ -106,21 +115,6 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,

 int run() {

-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!(props.major * 10 + props.minor >= 75)) {
-    std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
-              << std::endl;
-    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
-    return 0;
-  }
-
  const int length_m = 5120;
  const int length_n = 4096;
  const int length_k = 4096;
@ -175,9 +169,8 @@ int run() {
  tensor_d.sync_device();
  tensor_ref_d.sync_device();

-  // Initialize alpha and beta for dot product computation
+  // Initialize alpha for dot product computation
  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
-  ElementComputeEpilogue beta = ElementComputeEpilogue(0);

  // Split K dimension into 1 partitions
  int split_k_slices = 1;
@ -193,7 +186,7 @@ int run() {
                                        //    to project away the N dimension by setting the stride to zero.

    tensor_d.device_ref(),              // <- reference to matrix D on device
-    {alpha, beta},                      // <- tuple of alpha and beta
+    {alpha},                              // <- alpha
    split_k_slices};                    // <- k-dimension split factor

  // Using the arguments, query for extra workspace required for matrix multiplication computation
@ -205,8 +198,12 @@ int run() {
  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm_op;

+  // Check the problem size is supported or not 
+  cutlass::Status status = gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
  // Initialize CUTLASS kernel with arguments and workspace pointer
-  cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
+  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  // Launch initialized CUTLASS kernel
@ -248,7 +245,7 @@ int run() {
    for (int j = 0; j < problem_size.n(); ++j) {
      tensor_ref_d.at({i, j}) = std::max(
        ElementOutput(0), 
-        ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0}))
+        ElementOutput(tensor_ref_d.at({i, j}) + tensor_c_bias.at({i, 0}))
      );
    }
  }
@ -265,17 +262,36 @@ int run() {
 }

 int main() {
+
+  bool notSupported = false;
+
  // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
  //
  // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+    notSupported = true;
+  }

+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!(props.major * 10 + props.minor >= 75)) {
+    std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
-  else {
-    return run();
-  }
+
+  return run();
 }
  
--- a/examples/13_two_tensor_op_fusion/CMakeLists.txt
+++ b/examples/13_two_tensor_op_fusion/CMakeLists.txt
@ -0,0 +1,45 @@
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright notice, this list of
+#       conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright notice, this list of
+#       conditions and the following disclaimer in the documentation and/or other materials
+#       provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+#       to endorse or promote products derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cutlass_example_add_executable(
+  13_fused_two_gemms
+  fused_gemm.cu
+  )
+
+cutlass_example_add_executable(
+  13_fused_two_convs
+  fused_conv2d.cu
+  )
+
+
+target_include_directories(
+  13_fused_two_gemms
+  PRIVATE
+  .
+  )
+
+target_include_directories(
+  13_fused_two_convs
+  PRIVATE
+  .
+  )
+
--- a/examples/13_two_tensor_op_fusion/README.md
+++ b/examples/13_two_tensor_op_fusion/README.md
@ -0,0 +1,76 @@
+# Introduction
+
+This example shows fusing two back-to-back GEMMs/Convolutions into one kernel. 
+
+<p align="center"><img src=/media/images/13_example_fusion.png></p>
+
+When running two unfused GEMM/Conv operations, each operation loads one input 
+activation matrix, one weight matrix (or filter matrix) from the memory and then 
+stores the result activation matrix back to the memory.
+
+When the two GEMM/Conv operations are fused together, the mainloops of the two
+GEMMs/Convs run back to back in a single kernel. The output accumulator of the
+1st GEMM/Conv will be stored in the register file and reused as the activation
+input of the 2nd GEMM/Conv. This saves a round trip to memory for the activation
+matrix.
+
+
+This example computes the following:
+- 1st GEMM/Conv: D0 = relu(alpha0 .\* A0 \*\* B0)
+- 2nd GEMM/Conv: D1 = relu(alpha1 .\* D0 \*\* B1 + beta1 .\* C1)
+
+In the above equation, operator \*\* can be matrix multiplication or convolution operation.
+
+# Implementation Details
+
+In order to run two GEMM/Convs in a single kernel, the example requires the same number of
+threadblocks are used across 2 GEMMs/Convs. This also ensures the same threadblock tile M across
+2 GEMMs/Convs.
+
+In order to reuse the output accumulator (stored in register-file) of the 1st GEMM as the 
+input activation, the example enforces the following two constraints:
+
+- thread_block_tile_N = problem_N 
+
+<p align="center"><img src=/media/images/13_example_block_resident_fusion.png></p>
+
+This constraint ensures that each threadblock loads the entire weight/filter matrix in
+addition to its own input activation tile. Therefore the input activation tile of the
+2nd GEMM/Conv only depends on the output activation tile of the 1st GEMM/Conv, and the
+operation can be fully block-resident.
+
+- warp_tile_N = thread_block_tile_N 
+
+<p align="center"><img src=/media/images/13_example_rf_resident_fusion.png></p>
+
+This constraint ensures that each warp loads the entire weight/filter kBlock in
+addition to its own input activation tile. Therefore the input activation warp tile of the
+2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
+register file, and the operation can be fully register-file-resident.
+
+# Copyright
+
+Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+```
+  Redistribution and use in source and binary forms, with or without modification, are permitted
+  provided that the following conditions are met:
+      * Redistributions of source code must retain the above copyright notice, this list of
+        conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above copyright notice, this list of
+        conditions and the following disclaimer in the documentation and/or other materials
+        provided with the distribution.
+      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+        to endorse or promote products derived from this software without specific prior written
+        permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
+
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h
+++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h
@ -0,0 +1,368 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_conv2d_run.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+
+void run_nonfused_conv2d_fprop_f16_sm75() {
+
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute     = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_f16_sm75() {
+
+  using ElementA           = cutlass::half_t; 
+  using ElementB           = cutlass::half_t; 
+  using ElementC           = cutlass::half_t; 
+  using ElementAccumulator = cutlass::half_t; 
+  using ElementCompute     = cutlass::half_t; 
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
+
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute     = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_optimized_f16_sm75() {
+
+  using ElementA           = cutlass::half_t; 
+  using ElementB           = cutlass::half_t; 
+  using ElementC           = cutlass::half_t; 
+  using ElementAccumulator = cutlass::half_t; 
+  using ElementCompute     = cutlass::half_t; 
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h
+++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h
@ -0,0 +1,363 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_conv2d_run.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+
+void run_nonfused_conv2d_fprop_f16_sm80() {
+
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute     = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_f16_sm80() {
+
+  using ElementA           = cutlass::half_t; 
+  using ElementB           = cutlass::half_t; 
+  using ElementC           = cutlass::half_t; 
+  using ElementAccumulator = cutlass::half_t; 
+  using ElementCompute     = cutlass::half_t; 
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
+
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute     = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_optimized_f16_sm80() {
+
+  using ElementA           = cutlass::half_t; 
+  using ElementB           = cutlass::half_t; 
+  using ElementC           = cutlass::half_t; 
+  using ElementAccumulator = cutlass::half_t; 
+  using ElementCompute     = cutlass::half_t; 
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h
+++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h
@ -0,0 +1,367 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_interleaved_conv2d_run.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+
+void run_nonfused_conv2d_fprop_s8_sm75() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_s8_sm75() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_optimized_s8_sm75() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h
+++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h
@ -0,0 +1,368 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_interleaved_conv2d_run.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
+    {128, 56, 56, 64},    // input size (NHWC)
+    {64, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {128, 56, 56, 64}     // output size (NPQK)
+  );
+
+void run_nonfused_conv2d_fprop_s8_sm80() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_s8_sm80() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      8 * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+void run_fused_conv2d_fprop_optimized_s8_sm80() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      8 * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
+  bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
@ -0,0 +1,628 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implicit GEMM testbed
+*/
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/reduction/device/reduce_split_k.h"
+#include "cutlass/reduction/thread/reduction_operators.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+
+#include "cutlass/util/reference/host/convolution.h"
+#include "cutlass/util/reference/device/convolution.h"
+#include "cutlass/util/reference/device/tensor_relu.h"
+
+#include "cutlass/core_io.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "helper.h"
+
+#define CHECK_GT(val1, val2) \
+    if((val1) <= (val2)) \
+        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
+#define CHECK_TRUE(val) \
+    if(!(val)) \
+        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
+
+
+template <typename Conv2d0_, typename Conv2d1_>
+class B2bNonFusedConv2dRun {
+public:
+
+  using Conv2d0 = Conv2d0_;
+  using Conv2d1 = Conv2d1_;
+  using ElementAccumulator = typename Conv2d0::ElementAccumulator;
+  using ElementCompute = typename Conv2d0::ElementCompute;
+
+  static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator;
+  static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator, 
+        "Fused convolution operators must be the same");
+
+public:
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename Conv2d0::ElementA, typename Conv2d0::LayoutA> tensor_A0;
+  cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0;
+  cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_C0;
+  cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_computed;
+  cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_reference;
+
+  cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1;
+  cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_C1;
+  cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_computed;
+  cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_reference;
+
+
+public:
+
+  B2bNonFusedConv2dRun(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
+
+  }
+
+    /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  void initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      int scope;
+      int bits = cutlass::sizeof_bits<Element>::value;
+
+      if (bits <= 16) {
+        scope = 2;
+      }
+      else {
+        scope = 8;
+      }
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope, -scope, 0);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+    } 
+    else {
+    }
+  }
+
+  void initialize(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
+        
+    tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
+    tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
+    tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+
+    initialize_tensor(tensor_A0.host_view(), init_A, seed); 
+    initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); 
+    initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
+    initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); 
+    initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
+
+    tensor_A0.sync_device();
+    tensor_B0.sync_device();
+    tensor_C0.sync_device();
+    tensor_D0_computed.sync_device();
+    tensor_D0_reference.sync_device();
+    tensor_B1.sync_device();
+    tensor_C1.sync_device();
+    tensor_D1_computed.sync_device();
+    tensor_D1_reference.sync_device();
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1,
+    cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
+    ElementCompute alpha0 = ElementCompute(1),
+    ElementCompute beta0 = ElementCompute(0),
+    ElementCompute alpha1 = ElementCompute(1),
+    ElementCompute beta1 = ElementCompute(0),
+    bool relu = true,
+    int warm_ups = 1,
+    int runs = 100) {
+
+    initialize(problem_size_0, problem_size_1);
+
+    // configure the operator
+    Conv2d0 conv2d_op_0;
+    Conv2d1 conv2d_op_1;
+
+    typename Conv2d0::Arguments conv2d_args_0(
+      problem_size_0,
+      tensor_A0.device_ref(),
+      tensor_B0.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_D0_computed.device_ref(),
+      {alpha0, beta0},
+      split_k_mode
+    );
+    typename Conv2d1::Arguments conv2d_args_1(
+      problem_size_1,
+      tensor_D0_computed.device_ref(),
+      tensor_B1.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_computed.device_ref(),
+      {alpha1, beta1},
+      split_k_mode
+    );
+
+
+    cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0);
+
+    CUTLASS_CHECK(status);
+
+    status = conv2d_op_1.initialize(conv2d_args_1);
+
+    CUTLASS_CHECK(status);
+
+    for(int i = 0; i < warm_ups; i++) {
+        status = conv2d_op_0();
+        CUTLASS_CHECK(status);
+        status = conv2d_op_1();
+        CUTLASS_CHECK(status);
+    }
+
+    //
+    // Run Conv2d
+    //
+    cudaEvent_t start, stop1, stop2;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop1);
+    cudaEventCreate(&stop2);
+
+    cudaEventRecord(start);
+
+
+    for(int i = 0; i < runs; i++) {
+        // run conv2d operator
+        status = conv2d_op_0();
+        CUTLASS_CHECK(status);
+    }
+    cudaEventRecord(stop1);    
+    
+    for(int i = 0; i < runs; i++) {
+        // run conv2d operator
+        status = conv2d_op_1();
+        CUTLASS_CHECK(status);
+    }
+    cudaEventRecord(stop2);
+    cudaDeviceSynchronize();
+    float conv2d0Time, conv2d1Time, totalTime;
+    cudaEventElapsedTime(&conv2d0Time, start, stop1);
+    cudaEventElapsedTime(&conv2d1Time, stop1, stop2);
+    cudaEventElapsedTime(&totalTime, start, stop2);
+    std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
+    std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
+    std::cout << "total time " << totalTime / (float)runs << " ms\n";
+
+    tensor_D0_computed.sync_host();
+    tensor_D1_computed.sync_host();
+    
+    bool passed = false;
+
+    cutlass::reference::device::Conv2d<
+      typename Conv2d0::ElementA,
+      typename Conv2d0::LayoutA,
+      typename Conv2d0::ElementB,
+      typename Conv2d0::LayoutB,
+      typename Conv2d0::ElementC,
+      typename Conv2d0::LayoutC,
+      ElementCompute,
+      ElementAccumulator
+    >(
+      kConvolutionalOperator,
+      problem_size_0,
+      tensor_A0.device_ref(),
+      tensor_B0.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_D0_reference.device_ref(),
+      alpha0, 
+      beta0);
+    
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); 
+    }
+
+    cutlass::reference::device::Conv2d<
+      typename Conv2d1::ElementA,
+      typename Conv2d1::LayoutA,
+      typename Conv2d1::ElementB,
+      typename Conv2d1::LayoutB,
+      typename Conv2d1::ElementC,
+      typename Conv2d1::LayoutC,
+      ElementCompute,
+      ElementAccumulator
+    >(
+      kConvolutionalOperator,
+      problem_size_1,
+      tensor_D0_reference.device_ref(),
+      tensor_B1.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_reference.device_ref(),
+      alpha1, 
+      beta1);
+
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); 
+    }
+
+    cudaError_t result = cudaDeviceSynchronize();
+    CHECK_TRUE(result == cudaSuccess);
+
+    // sync host (copy device data to host) for dumping error output in case of mismatches
+    tensor_D0_reference.sync_host();
+    tensor_D1_reference.sync_host();
+    
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
+
+    passed = cutlass::reference::host::TensorEquals(
+      tensor_D1_computed.host_view(), 
+      tensor_D1_reference.host_view());
+
+    CHECK_TRUE(passed);
+
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_B2bImplicitGemm_device_nonfused.txt";
+      std::cerr << "Dumping results in " << fname.str() << "\n";
+
+      std::ofstream results(fname.str());
+
+      results << problem_size_0 << std::endl;
+      results << problem_size_1 << std::endl;
+
+      results
+        << "\nA0:\n" << tensor_A0.host_view() << "\n"
+        << "\nB0:\n" << tensor_B0.host_view() << "\n"
+        << "\nC0:\n" << tensor_C0.host_view() << "\n"
+        << "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n"
+        << "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n"
+        << "\nB1:\n" << tensor_B1.host_view() << "\n"
+        << "\nC1:\n" << tensor_C1.host_view() << "\n"
+        << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
+        << "\nD1 computed:\n" << tensor_D1_computed.host_view();
+
+
+    }
+
+    return passed;
+  }
+
+};
+
+template <typename B2bConv2d_>
+class B2bFusedConv2dRun {
+public:
+
+  using B2bConv2d = B2bConv2d_;
+  using ElementAccumulator = typename B2bConv2d::ElementAccumulator;
+  using ElementCompute = typename B2bConv2d::ElementCompute;
+
+  static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator;
+
+public:
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename B2bConv2d::ElementA, typename B2bConv2d::LayoutA> tensor_A0;
+  cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C0;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D0_reference;
+
+  cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C1;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_computed;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_reference;
+
+
+public:
+
+  B2bFusedConv2dRun(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
+
+  }
+
+    /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  void initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      int scope;
+      int bits = cutlass::sizeof_bits<Element>::value;
+
+      if (bits <= 16) {
+        scope = 2;
+      }
+      else {
+        scope = 8;
+      }
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope, -scope, 0);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+    } 
+    else {
+    }
+  }
+
+  void initialize(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
+        
+    tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
+    tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
+    tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+
+    initialize_tensor(tensor_A0.host_view(), init_A, seed); 
+    initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); 
+    initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
+    initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); 
+    initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
+
+    tensor_A0.sync_device();
+    tensor_B0.sync_device();
+    tensor_C0.sync_device();
+    tensor_D0_reference.sync_device();
+    tensor_B1.sync_device();
+    tensor_C1.sync_device();
+    tensor_D1_computed.sync_device();
+    tensor_D1_reference.sync_device();
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1,
+    cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
+    ElementCompute alpha0 = ElementCompute(1),
+    ElementCompute beta0 = ElementCompute(0),
+    ElementCompute alpha1 = ElementCompute(1),
+    ElementCompute beta1 = ElementCompute(0),
+    bool relu = true,
+    int warm_ups = 1,
+    int runs = 100) {
+
+    initialize(problem_size_0, problem_size_1);
+
+    // configure the operator
+    B2bConv2d b2b_conv2d_op;
+
+    typename B2bConv2d::Arguments b2b_conv2d_args(
+      problem_size_0,
+      problem_size_1,
+      tensor_A0.device_ref(),
+      tensor_B0.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_B1.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_computed.device_ref(),
+      {alpha0, beta0},
+      {alpha1, beta1},
+      split_k_mode
+    );
+
+    cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args);
+
+    CUTLASS_CHECK(status);
+
+    for(int i = 0; i < warm_ups; i++) {
+        status = b2b_conv2d_op();
+        CUTLASS_CHECK(status);
+    }
+
+    //
+    // Run the Conv2d
+    //
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
+
+    for(int i = 0; i < runs; i++) {
+
+        // run conv2d operator
+        status = b2b_conv2d_op();
+        CUTLASS_CHECK(status);
+    }
+    
+    cudaEventRecord(stop);
+    cudaDeviceSynchronize();
+    float conv2dTime;
+    cudaEventElapsedTime(&conv2dTime, start, stop);
+    std::cout << "time " << conv2dTime / (float)runs << " ms\n";
+
+    tensor_D1_computed.sync_host();
+    
+    bool passed = false;
+
+    cutlass::reference::device::Conv2d<
+      typename B2bConv2d::ElementA,
+      typename B2bConv2d::LayoutA,
+      typename B2bConv2d::ElementB,
+      typename B2bConv2d::LayoutB,
+      typename B2bConv2d::ElementC,
+      typename B2bConv2d::LayoutC,
+      ElementCompute,
+      ElementAccumulator
+    >(
+      kConvolutionalOperator,
+      problem_size_0,
+      tensor_A0.device_ref(),
+      tensor_B0.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_D0_reference.device_ref(),
+      alpha0, 
+      beta0);
+
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); 
+    }
+
+    cutlass::reference::device::Conv2d<
+      typename B2bConv2d::ElementA,
+      typename B2bConv2d::LayoutA,
+      typename B2bConv2d::ElementB,
+      typename B2bConv2d::LayoutB,
+      typename B2bConv2d::ElementC,
+      typename B2bConv2d::LayoutC,
+      ElementCompute,
+      ElementAccumulator
+    >(
+      kConvolutionalOperator,
+      problem_size_1,
+      tensor_D0_reference.device_ref(),
+      tensor_B1.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_reference.device_ref(),
+      alpha1, 
+      beta1);
+
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); 
+    }
+
+    cudaError_t result = cudaDeviceSynchronize();
+    CHECK_TRUE(result == cudaSuccess);
+
+    // sync host (copy device data to host) for dumping error output in case of mismatches
+    tensor_D0_reference.sync_host();
+    tensor_D1_reference.sync_host();
+    
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
+
+    passed = cutlass::reference::host::TensorEquals(
+      tensor_D1_computed.host_view(), 
+      tensor_D1_reference.host_view());
+
+    CHECK_TRUE(passed);
+
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_B2bImplicitGemm_device_fused.txt";
+      std::cerr << "Dumping results in " << fname.str() << "\n";
+
+      std::ofstream results(fname.str());
+
+      results << problem_size_0 << std::endl;
+      results << problem_size_1 << std::endl;
+
+      results
+        << "\nA0:\n" << tensor_A0.host_view() << "\n"
+        << "\nB0:\n" << tensor_B0.host_view() << "\n"
+        << "\nC0:\n" << tensor_C0.host_view() << "\n"
+        << "\nB1:\n" << tensor_B1.host_view() << "\n"
+        << "\nC1:\n" << tensor_C1.host_view() << "\n"
+        << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
+        << "\nD1 computed:\n" << tensor_D1_computed.host_view();
+
+
+    }
+
+    return passed;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h
+++ b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -43,14 +43,15 @@

 ////////////////////////////////////////////////////////////////////////////////

+cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576);
+cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64);
+
 void run_nonfused_gemm_f16() {

  using ElementOutput = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute = cutlass::half_t;

-  cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
-  cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
  ElementCompute alpha0 = ElementCompute(2);
  ElementCompute beta0 = ElementCompute(0);
  ElementCompute alpha1 = ElementCompute(2);
@ -110,7 +111,7 @@ void run_nonfused_gemm_f16() {
  B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;

  std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
-  bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+  bool pass = nonFusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
  if(pass)
    std::cout << "Pass\n";
  else
@ -123,8 +124,6 @@ void run_fused_gemm_f16() {
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute = cutlass::half_t;

-  cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
-  cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
  ElementCompute alpha0 = ElementCompute(2);
  ElementCompute beta0 = ElementCompute(0);
  ElementCompute alpha1 = ElementCompute(2);
@ -178,7 +177,7 @@ void run_fused_gemm_f16() {
  B2bFusedGemmRun<B2bGemm> fusedGemm;

  std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
-  bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+  bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
  if(passed)
    std::cout << "Pass\n";
  else
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h
+++ b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h
@ -0,0 +1,189 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "device/b2b_gemm.h"
+#include "b2b_gemm_run.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576);
+cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64);
+
+void run_nonfused_gemm_f16_sm80() {
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using Gemm0 = cutlass::gemm::device::Gemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3
+  >;
+  using Gemm1 = cutlass::gemm::device::Gemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3
+  >;
+
+  B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
+
+  std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
+  bool pass = nonFusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+}
+
+void run_fused_gemm_f16_sm80() {
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+
+  using B2bGemm = cutlass::gemm::device::B2bGemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3
+  >;
+
+  B2bFusedGemmRun<B2bGemm> fusedGemm;
+
+  std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
+  bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(passed)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+}
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -121,7 +121,9 @@ struct B2bNonFusedGemmRun
    ElementCompute beta0 = ElementCompute(0),
    ElementCompute alpha1 = ElementCompute(1), 
    ElementCompute beta1 = ElementCompute(0),
-    bool relu = true) {
+    bool relu = true,
+    int warm_ups = 1,
+    int runs = 100) {
    
    //
    // Allocate the GEMM workspace
@ -222,6 +224,14 @@ struct B2bNonFusedGemmRun
    status = gemm_op_1.initialize(arguments_1);

    CUTLASS_CHECK(status);
+
+    for(int i = 0; i < warm_ups; i++) {
+        status = gemm_op_0();
+        CUTLASS_CHECK(status);
+        status = gemm_op_1();
+        CUTLASS_CHECK(status);
+    }
+
    //
    // Run the GEMM
    //
@ -233,13 +243,13 @@ struct B2bNonFusedGemmRun

    cudaEventRecord(start);

-    for(int i = 0; i < 100; i++) {
+    for(int i = 0; i < runs; i++) {
        status = gemm_op_0();
    
        CUTLASS_CHECK(status);
    }
    cudaEventRecord(stop1);
-    for(int i = 0; i < 100; i++) {
+    for(int i = 0; i < runs; i++) {
    
        status = gemm_op_1();
    
@ -252,9 +262,9 @@ struct B2bNonFusedGemmRun
    cudaEventElapsedTime(&gemm0Time, start, stop1);
    cudaEventElapsedTime(&gemm1Time, stop1, stop2);
    cudaEventElapsedTime(&totalTime, start, stop2);
-    std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n";
-    std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n";
-    std::cout << "total time " << totalTime / 100.0 << " ms\n";
+    std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
+    std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
+    std::cout << "total time " << totalTime / (float)runs << " ms\n";

    tensor_D0.sync_host();
    tensor_D1.sync_host();
@ -415,7 +425,9 @@ struct B2bFusedGemmRun
    ElementCompute beta0 = ElementCompute(0),
    ElementCompute alpha1 = ElementCompute(1), 
    ElementCompute beta1 = ElementCompute(0),
-    bool relu = true) {
+    bool relu = true,
+    int warm_ups = 1,
+    int runs = 100) {
    
    //
    // Allocate the GEMM workspace
@ -433,10 +445,6 @@ struct B2bFusedGemmRun
      typename B2bGemm::ElementC, 
      typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());

-//    cutlass::HostTensor<
-//      typename B2bGemm::ElementC, 
-//      typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
-
    cutlass::HostTensor<
      typename B2bGemm::ElementC, 
      typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn());
@ -503,6 +511,11 @@ struct B2bFusedGemmRun

    CUTLASS_CHECK(status);

+    for(int i = 0; i < warm_ups; i++) {
+        status = b2b_gemm_op();
+        CUTLASS_CHECK(status);
+    }
+
    //
    // Run the GEMM
    //
@ -513,7 +526,7 @@ struct B2bFusedGemmRun

    cudaEventRecord(start);

-    for(int i = 0; i < 100; i++) {
+    for(int i = 0; i < runs; i++) {
        status = b2b_gemm_op();

        CUTLASS_CHECK(status);
@ -523,9 +536,8 @@ struct B2bFusedGemmRun
    cudaDeviceSynchronize();
    float gemmTime;
    cudaEventElapsedTime(&gemmTime, start, stop);
-    std::cout << "time " << gemmTime / 100.0 << " ms\n";
+    std::cout << "time " << gemmTime / (float)runs << " ms\n";

-    //tensor_D0.sync_host();
    tensor_D1.sync_host();

    //
@ -593,7 +605,6 @@ struct B2bFusedGemmRun
        << "A0 =\n" << tensor_A0.host_view()
        << "\nB0 =\n" << tensor_B0.host_view()
        << "\nC0 =\n" << tensor_C0.host_view()
-//        << "\nD0 =\n" << tensor_D0.host_view()
        << "\nB1 =\n" << tensor_B1.host_view()
        << "\nC1 =\n" << tensor_C1.host_view()
        << "\n\nReference =\n" << reference_D1.host_view()
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h
+++ b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -43,14 +43,15 @@

 ////////////////////////////////////////////////////////////////////////////////

+cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576);
+cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64);
+
 void run_nonfused_gemm_s8() {

  using ElementOutput = int8_t;
  using ElementAccumulator = int32_t;
  using ElementCompute = float;

-  cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
-  cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
  ElementCompute alpha0 = ElementCompute(2);
  ElementCompute beta0 = ElementCompute(0);
  ElementCompute alpha1 = ElementCompute(2);
@ -110,7 +111,7 @@ void run_nonfused_gemm_s8() {
  B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;

  std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
-  bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+  bool pass = nonFusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
  if(pass)
    std::cout << "Pass\n";
  else
@ -123,8 +124,6 @@ void run_fused_gemm_s8() {
  using ElementAccumulator = int32_t;
  using ElementCompute = float;

-  cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
-  cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
  ElementCompute alpha0 = ElementCompute(2);
  ElementCompute beta0 = ElementCompute(0);
  ElementCompute alpha1 = ElementCompute(2);
@ -178,7 +177,7 @@ void run_fused_gemm_s8() {
  B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;

  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
-  bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+  bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
  if(passed)
    std::cout << "Pass\n";
  else
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h
+++ b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -43,14 +43,15 @@

 ////////////////////////////////////////////////////////////////////////////////

+cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576);
+cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64);
+
 void run_nonfused_gemm_s8_sm80() {

  using ElementOutput = int8_t;
  using ElementAccumulator = int32_t;
  using ElementCompute = float;

-  cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
-  cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
  ElementCompute alpha0 = ElementCompute(2);
  ElementCompute beta0 = ElementCompute(0);
  ElementCompute alpha1 = ElementCompute(2);
@ -86,8 +87,7 @@ void run_nonfused_gemm_s8_sm80() {
    16,
    16,
    false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
  >;
  using Gemm1 = cutlass::gemm::device::Gemm<
    int8_t,
@ -113,14 +113,13 @@ void run_nonfused_gemm_s8_sm80() {
    16,
    16,
    false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
  >;

  B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;

  std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
-  bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+  bool pass = nonFusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
  if(pass)
    std::cout << "Pass\n";
  else
@ -133,8 +132,6 @@ void run_fused_gemm_s8_sm80() {
  using ElementAccumulator = int32_t;
  using ElementCompute = float;

-  cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
-  cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
  ElementCompute alpha0 = ElementCompute(2);
  ElementCompute beta0 = ElementCompute(0);
  ElementCompute alpha1 = ElementCompute(2);
@ -193,7 +190,7 @@ void run_fused_gemm_s8_sm80() {
  B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;

  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
-  bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
+  bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
  if(passed)
    std::cout << "Pass\n";
  else
--- a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
@ -0,0 +1,661 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implicit GEMM testbed
+*/
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/reduction/device/reduce_split_k.h"
+#include "cutlass/reduction/thread/reduction_operators.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/host_reorder.h"
+
+#include "cutlass/util/reference/host/convolution.h"
+#include "cutlass/util/reference/device/convolution.h"
+#include "cutlass/util/reference/device/tensor_relu.h"
+
+#include "cutlass/core_io.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "helper.h"
+
+#define CHECK_GT(val1, val2) \
+    if((val1) <= (val2)) \
+        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
+#define CHECK_TRUE(val) \
+    if(!(val)) \
+        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
+
+
+template <typename Conv2d0_, typename Conv2d1_, int InterleavedK>
+class B2bInterleavedNonFusedConv2dRun {
+public:
+
+  using Conv2d0 = Conv2d0_;
+  using Conv2d1 = Conv2d1_;
+  using ElementAccumulator = typename Conv2d0::ElementAccumulator;
+  using ElementCompute = typename Conv2d0::ElementCompute;
+
+  static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator;
+  static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator, 
+        "Fused convolution operators must be the same");
+
+public:
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename Conv2d0::ElementA, typename Conv2d0::LayoutA> tensor_A0;
+  cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0;
+  cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0_reordered;
+  cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_C0;
+  cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_computed;
+  cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_reference;
+
+  cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1;
+  cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1_reordered;
+  cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_C1;
+  cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_computed;
+  cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_reference;
+
+
+public:
+
+  B2bInterleavedNonFusedConv2dRun(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
+
+  }
+
+    /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  void initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      int scope;
+      int bits = cutlass::sizeof_bits<Element>::value;
+
+      if (bits <= 16) {
+        scope = 2;
+      }
+      else {
+        scope = 8;
+      }
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope, -scope, 0);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+    } 
+    else {
+    }
+  }
+
+  void initialize(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
+        
+    tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
+    tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
+    tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
+    tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+
+    initialize_tensor(tensor_A0.host_view(), init_A, seed); 
+    initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); 
+    initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
+    initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); 
+    initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
+
+    //Reorder B0 and B1
+    cutlass::reorder_convK<InterleavedK, InterleavedK>(
+        tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0));
+    cutlass::reorder_convK<InterleavedK, InterleavedK>(
+        tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1));
+
+    tensor_A0.sync_device();
+    tensor_B0.sync_device();
+    tensor_B0_reordered.sync_device();
+    tensor_C0.sync_device();
+    tensor_D0_computed.sync_device();
+    tensor_D0_reference.sync_device();
+    tensor_B1.sync_device();
+    tensor_B1_reordered.sync_device();
+    tensor_C1.sync_device();
+    tensor_D1_computed.sync_device();
+    tensor_D1_reference.sync_device();
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1,
+    cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
+    ElementCompute alpha0 = ElementCompute(1),
+    ElementCompute beta0 = ElementCompute(0),
+    ElementCompute alpha1 = ElementCompute(1),
+    ElementCompute beta1 = ElementCompute(0),
+    bool relu = true,
+    int warm_ups = 1,
+    int runs = 100) {
+
+    initialize(problem_size_0, problem_size_1);
+
+    // configure the operator
+    Conv2d0 conv2d_op_0;
+    Conv2d1 conv2d_op_1;
+
+    typename Conv2d0::Arguments conv2d_args_0(
+      problem_size_0,
+      tensor_A0.device_ref(),
+      tensor_B0_reordered.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_D0_computed.device_ref(),
+      {alpha0, beta0},
+      split_k_mode
+    );
+    typename Conv2d1::Arguments conv2d_args_1(
+      problem_size_1,
+      tensor_D0_computed.device_ref(),
+      tensor_B1_reordered.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_computed.device_ref(),
+      {alpha1, beta1},
+      split_k_mode
+    );
+
+
+    cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0);
+
+    CUTLASS_CHECK(status);
+
+    status = conv2d_op_1.initialize(conv2d_args_1);
+
+    CUTLASS_CHECK(status);
+
+    for(int i = 0; i < warm_ups; i++) {
+        status = conv2d_op_0();
+        CUTLASS_CHECK(status);
+        status = conv2d_op_1();
+        CUTLASS_CHECK(status);
+    }
+
+    //
+    // Run Conv2d
+    //
+    cudaEvent_t start, stop1, stop2;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop1);
+    cudaEventCreate(&stop2);
+
+    cudaEventRecord(start);
+
+
+    for(int i = 0; i < runs; i++) {
+        // run conv2d operator
+        status = conv2d_op_0();
+        CUTLASS_CHECK(status);
+    }
+    cudaEventRecord(stop1);    
+    
+    for(int i = 0; i < runs; i++) {
+        // run conv2d operator
+        status = conv2d_op_1();
+        CUTLASS_CHECK(status);
+    }
+    cudaEventRecord(stop2);
+    cudaDeviceSynchronize();
+    float conv2d0Time, conv2d1Time, totalTime;
+    cudaEventElapsedTime(&conv2d0Time, start, stop1);
+    cudaEventElapsedTime(&conv2d1Time, stop1, stop2);
+    cudaEventElapsedTime(&totalTime, start, stop2);
+    std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
+    std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
+    std::cout << "total time " << totalTime / (float)runs << " ms\n";
+
+    tensor_D0_computed.sync_host();
+    tensor_D1_computed.sync_host();
+    
+    bool passed = false;
+
+    cutlass::reference::device::Conv2d<
+      typename Conv2d0::ElementA,
+      typename Conv2d0::LayoutA,
+      typename Conv2d0::ElementB,
+      typename Conv2d0::LayoutB,
+      typename Conv2d0::ElementC,
+      typename Conv2d0::LayoutC,
+      ElementCompute,
+      ElementAccumulator,
+      cutlass::NumericConverterClamp<typename Conv2d0::ElementC, ElementCompute>
+    >(
+      kConvolutionalOperator,
+      problem_size_0,
+      tensor_A0.device_ref(),
+      tensor_B0.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_D0_reference.device_ref(),
+      alpha0, 
+      beta0);
+    
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); 
+    }
+
+    cutlass::reference::device::Conv2d<
+      typename Conv2d1::ElementA,
+      typename Conv2d1::LayoutA,
+      typename Conv2d1::ElementB,
+      typename Conv2d1::LayoutB,
+      typename Conv2d1::ElementC,
+      typename Conv2d1::LayoutC,
+      ElementCompute,
+      ElementAccumulator,
+      cutlass::NumericConverterClamp<typename Conv2d1::ElementC, ElementCompute>
+    >(
+      kConvolutionalOperator,
+      problem_size_1,
+      tensor_D0_reference.device_ref(),
+      tensor_B1.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_reference.device_ref(),
+      alpha1, 
+      beta1);
+
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); 
+    }
+
+    cudaError_t result = cudaDeviceSynchronize();
+    CHECK_TRUE(result == cudaSuccess);
+
+    // sync host (copy device data to host) for dumping error output in case of mismatches
+    tensor_D0_reference.sync_host();
+    tensor_D1_reference.sync_host();
+    
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
+
+    passed = cutlass::reference::host::TensorEquals(
+      tensor_D1_computed.host_view(), 
+      tensor_D1_reference.host_view());
+
+    CHECK_TRUE(passed);
+
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_B2bImplicitGemm_device_interleaved_nonfused.txt";
+      std::cerr << "Dumping results in " << fname.str() << "\n";
+
+      std::ofstream results(fname.str());
+
+      results << problem_size_0 << std::endl;
+      results << problem_size_1 << std::endl;
+
+      results
+        << "\nA0:\n" << tensor_A0.host_view() << "\n"
+        << "\nB0:\n" << tensor_B0.host_view() << "\n"
+        << "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n"
+        << "\nC0:\n" << tensor_C0.host_view() << "\n"
+        << "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n"
+        << "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n"
+        << "\nB1:\n" << tensor_B1.host_view() << "\n"
+        << "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n"
+        << "\nC1:\n" << tensor_C1.host_view() << "\n"
+        << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
+        << "\nD1 computed:\n" << tensor_D1_computed.host_view();
+
+
+    }
+
+    return passed;
+  }
+
+};
+
+template <typename B2bConv2d_, int InterleavedK>
+class B2bInterleavedFusedConv2dRun {
+public:
+
+  using B2bConv2d = B2bConv2d_;
+  using ElementAccumulator = typename B2bConv2d::ElementAccumulator;
+  using ElementCompute = typename B2bConv2d::ElementCompute;
+
+  static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator;
+
+public:
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename B2bConv2d::ElementA, typename B2bConv2d::LayoutA> tensor_A0;
+  cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0;
+  cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0_reordered;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C0;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D0_reference;
+
+  cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1;
+  cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1_reordered;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C1;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_computed;
+  cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_reference;
+
+
+public:
+
+  B2bInterleavedFusedConv2dRun(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
+
+  }
+
+    /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  void initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      int scope;
+      int bits = cutlass::sizeof_bits<Element>::value;
+
+      if (bits <= 16) {
+        scope = 2;
+      }
+      else {
+        scope = 8;
+      }
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope, -scope, 0);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+    } 
+    else {
+    }
+  }
+
+  void initialize(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
+        
+    tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
+    tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
+    tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
+    tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
+    tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+    tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
+
+    initialize_tensor(tensor_A0.host_view(), init_A, seed); 
+    initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); 
+    initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
+    initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); 
+    initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
+
+    //Reorder B0 and B1
+    cutlass::reorder_convK<16, InterleavedK>(
+        tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0));
+    cutlass::reorder_convK<InterleavedK, InterleavedK>(
+        tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1));
+
+    tensor_A0.sync_device();
+    tensor_B0.sync_device();
+    tensor_B0_reordered.sync_device();
+    tensor_C0.sync_device();
+    tensor_D0_reference.sync_device();
+    tensor_B1.sync_device();
+    tensor_B1_reordered.sync_device();
+    tensor_C1.sync_device();
+    tensor_D1_computed.sync_device();
+    tensor_D1_reference.sync_device();
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::conv::Conv2dProblemSize const &problem_size_0,
+    cutlass::conv::Conv2dProblemSize const &problem_size_1,
+    cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
+    ElementCompute alpha0 = ElementCompute(1),
+    ElementCompute beta0 = ElementCompute(0),
+    ElementCompute alpha1 = ElementCompute(1),
+    ElementCompute beta1 = ElementCompute(0),
+    bool relu = true,
+    int warm_ups = 1,
+    int runs = 100) {
+
+    initialize(problem_size_0, problem_size_1);
+
+    // configure the operator
+    B2bConv2d b2b_conv2d_op;
+
+    typename B2bConv2d::Arguments b2b_conv2d_args(
+      problem_size_0,
+      problem_size_1,
+      tensor_A0.device_ref(),
+      tensor_B0_reordered.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_B1_reordered.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_computed.device_ref(),
+      {alpha0, beta0},
+      {alpha1, beta1},
+      split_k_mode
+    );
+
+    cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args);
+
+    CUTLASS_CHECK(status);
+
+    for(int i = 0; i < warm_ups; i++) {
+        status = b2b_conv2d_op();
+        CUTLASS_CHECK(status);
+    }
+
+    //
+    // Run the Conv2d
+    //
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
+
+    for(int i = 0; i < runs; i++) {
+
+        // run conv2d operator
+        status = b2b_conv2d_op();
+        CUTLASS_CHECK(status);
+    }
+    
+    cudaEventRecord(stop);
+    cudaDeviceSynchronize();
+    float conv2dTime;
+    cudaEventElapsedTime(&conv2dTime, start, stop);
+    std::cout << "time " << conv2dTime / (float)runs << " ms\n";
+
+    tensor_D1_computed.sync_host();
+    
+    bool passed = false;
+
+    cutlass::reference::device::Conv2d<
+      typename B2bConv2d::ElementA,
+      typename B2bConv2d::LayoutA,
+      typename B2bConv2d::ElementB,
+      typename B2bConv2d::LayoutB,
+      typename B2bConv2d::ElementC,
+      typename B2bConv2d::LayoutC,
+      ElementCompute,
+      ElementAccumulator,
+      cutlass::NumericConverterClamp<typename B2bConv2d::ElementC, ElementCompute>
+    >(
+      kConvolutionalOperator,
+      problem_size_0,
+      tensor_A0.device_ref(),
+      tensor_B0.device_ref(),
+      tensor_C0.device_ref(),
+      tensor_D0_reference.device_ref(),
+      alpha0, 
+      beta0);
+
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); 
+    }
+
+    cutlass::reference::device::Conv2d<
+      typename B2bConv2d::ElementA,
+      typename B2bConv2d::LayoutA,
+      typename B2bConv2d::ElementB,
+      typename B2bConv2d::LayoutB,
+      typename B2bConv2d::ElementC,
+      typename B2bConv2d::LayoutC,
+      ElementCompute,
+      ElementAccumulator,
+      cutlass::NumericConverterClamp<typename B2bConv2d::ElementC, ElementCompute>
+    >(
+      kConvolutionalOperator,
+      problem_size_1,
+      tensor_D0_reference.device_ref(),
+      tensor_B1.device_ref(),
+      tensor_C1.device_ref(),
+      tensor_D1_reference.device_ref(),
+      alpha1, 
+      beta1);
+
+    if(relu) {
+       cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); 
+    }
+
+    cudaError_t result = cudaDeviceSynchronize();
+    CHECK_TRUE(result == cudaSuccess);
+
+    // sync host (copy device data to host) for dumping error output in case of mismatches
+    tensor_D0_reference.sync_host();
+    tensor_D1_reference.sync_host();
+    
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
+    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
+
+    passed = cutlass::reference::host::TensorEquals(
+      tensor_D1_computed.host_view(), 
+      tensor_D1_reference.host_view());
+
+    CHECK_TRUE(passed);
+
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_B2bImplicitGemm_device_interleaved_fused.txt";
+      std::cerr << "Dumping results in " << fname.str() << "\n";
+
+      std::ofstream results(fname.str());
+
+      results << problem_size_0 << std::endl;
+      results << problem_size_1 << std::endl;
+
+      results
+        << "\nA0:\n" << tensor_A0.host_view() << "\n"
+        << "\nB0:\n" << tensor_B0.host_view() << "\n"
+        << "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n"
+        << "\nC0:\n" << tensor_C0.host_view() << "\n"
+        << "\nB1:\n" << tensor_B1.host_view() << "\n"
+        << "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n"
+        << "\nC1:\n" << tensor_C1.host_view() << "\n"
+        << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
+        << "\nD1 computed:\n" << tensor_D1_computed.host_view();
+
+
+    }
+
+    return passed;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -243,6 +243,7 @@ struct B2bInterleavedNonFusedGemmRun
        status = gemm_op_1();
        CUTLASS_CHECK(status);
    }
+
    //
    // Run the GEMM
    //
@ -455,10 +456,6 @@ struct B2bInterleavedFusedGemmRun
      typename B2bGemm::ElementC, 
      typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());

-//    cutlass::HostTensor<
-//      typename B2bGemm::ElementC, 
-//      typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
-
    cutlass::HostTensor<
      typename B2bGemm::ElementC, 
      typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn());
@ -507,7 +504,6 @@ struct B2bInterleavedFusedGemmRun
    tensor_B0.sync_device();
    tensor_B0_reordered.sync_device();
    tensor_C0.sync_device();
-    //tensor_D0.sync_device();
    tensor_B1.sync_device();
    tensor_B1_reordered.sync_device();
    tensor_C1.sync_device();
@ -566,7 +562,6 @@ struct B2bInterleavedFusedGemmRun
    cudaEventElapsedTime(&gemmTime, start, stop);
    std::cout << "time " << gemmTime / (float)runs << " ms\n";

-    //tensor_D0.sync_host();
    tensor_D1.sync_host();

    //
@ -635,7 +630,6 @@ struct B2bInterleavedFusedGemmRun
        << "\nB0 =\n" << tensor_B0.host_view()
        << "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
        << "\nC0 =\n" << tensor_C0.host_view()
-//        << "\nD0 =\n" << tensor_D0.host_view()
        << "\nB1 =\n" << tensor_B1.host_view()
        << "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
        << "\nC1 =\n" << tensor_C1.host_view()
--- a/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
+++ b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
@ -0,0 +1,274 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Template for device-level Implicit GEMM
+*/
+
+#pragma once
+
+#include <limits>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/conv/convolution.h"
+
+#include "kernel/b2b_implicit_gemm_convolution.h"
+#include "kernel/default_b2b_conv2d_fprop.h"
+
+namespace cutlass {
+namespace conv {
+namespace device {
+
+template<typename B2bImplicitGemmKernel_>
+class B2bImplicitGemmConvolution {
+public:
+
+  using B2bImplicitGemmKernel = B2bImplicitGemmKernel_;
+
+  using ElementA = typename B2bImplicitGemmKernel::ElementA;
+  using LayoutA = typename B2bImplicitGemmKernel::LayoutA;
+  using ElementB = typename B2bImplicitGemmKernel::ElementB;
+  using LayoutB = typename B2bImplicitGemmKernel::LayoutB;
+  using ElementC = typename B2bImplicitGemmKernel::ElementC;
+  using LayoutC = typename B2bImplicitGemmKernel::LayoutC;
+  using ElementAccumulator = typename B2bImplicitGemmKernel::ElementAccumulator;
+  using ElementCompute = typename B2bImplicitGemmKernel::ElementCompute;
+  using OperatorClass = typename B2bImplicitGemmKernel::OperatorClass;
+  using ArchTag = typename B2bImplicitGemmKernel::ArchTag;
+  using ThreadblockShape0 = typename B2bImplicitGemmKernel::ThreadblockShape0;
+  using ThreadblockShape1 = typename B2bImplicitGemmKernel::ThreadblockShape1;
+  using WarpShape0 = typename B2bImplicitGemmKernel::WarpShape0;
+  using WarpShape1 = typename B2bImplicitGemmKernel::WarpShape1;
+  using InstructionShape = typename B2bImplicitGemmKernel::InstructionShape;
+  using ThreadblockSwizzle = typename B2bImplicitGemmKernel::ThreadblockSwizzle;
+  using EpilogueOutputOp0 = typename B2bImplicitGemmKernel::EpilogueOutputOp0;
+  using EpilogueOutputOp1 = typename B2bImplicitGemmKernel::EpilogueOutputOp1;
+  static int const kStages = B2bImplicitGemmKernel::kStages;
+  static int const kConvDim = B2bImplicitGemmKernel::kConvDim;
+  using WarpMmaOperator0 = typename B2bImplicitGemmKernel::WarpMmaOperator0;
+  using WarpMmaOperator1 = typename B2bImplicitGemmKernel::WarpMmaOperator1;
+  using ArchMmaOperator = typename B2bImplicitGemmKernel::ArchMmaOperator;
+  using MathOperator = typename B2bImplicitGemmKernel::MathOperator; 
+
+  static cutlass::conv::Operator const kConvolutionalOperator = B2bImplicitGemmKernel::kConvolutionalOperator;
+  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = B2bImplicitGemmKernel::kIteratorAlgorithm;
+
+  static int const kWarpCount = 
+    (ThreadblockShape0::kM / WarpShape0::kM) * 
+    (ThreadblockShape0::kN / WarpShape0::kN);
+
+  /// Argument structure
+  using Arguments = typename B2bImplicitGemmKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename B2bImplicitGemmKernel::Params params_;
+
+public:
+
+  /// Constructs Implicit GEMM
+  B2bImplicitGemmConvolution() { }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    // dispatch to iterators
+    Status status = B2bImplicitGemmKernel::B2bMma::IteratorA0::can_implement(args.problem_size_0);
+    if (Status::kSuccess != status) {
+      return status;
+    }
+
+    status = B2bImplicitGemmKernel::B2bMma::IteratorB0::can_implement(args.problem_size_0);
+    if (Status::kSuccess != status) {
+      return status;
+    }
+
+    status = B2bImplicitGemmKernel::B2bMma::IteratorB1::can_implement(args.problem_size_1);
+    if (Status::kSuccess != status) {
+      return status;
+    }
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(
+      threadblock_swizzle.get_tiled_shape(
+        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0),
+        {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+        args.problem_size_0.split_k_slices));
+
+    if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
+          grid.z <= std::numeric_limits<uint16_t>::max())) {
+
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+  
+    size_t workspace_bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0),
+        {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+        args.problem_size_0.split_k_slices);
+
+    if(args.split_k_mode == SplitKMode::kParallel) {
+
+      // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
+      // The user needs to call a reduction operator to optain the final output tensor
+      workspace_bytes = 
+        sizeof(ElementAccumulator) *
+        size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size_0)) *
+        size_t(grid_tiled_shape.k());
+    }
+
+    else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size_0.split_k_slices > 1) {
+
+      // Split-K serial: The user workspace is used to store semaphore and serialize writing the 
+      // final reduced output to user's output tensor
+      workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
+    }
+
+    return workspace_bytes;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+   
+    if (args.problem_size_0.split_k_slices > 1) {
+
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+
+      cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
+
+      if (status != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    // initialize the params structure from the arguments
+    params_ = typename B2bImplicitGemmKernel::Params(
+    	args,
+    	static_cast<int *>(workspace)
+    );
+    
+    int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage));
+
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<B2bImplicitGemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+
+      result = cudaFuncSetAttribute(
+          cutlass::Kernel<B2bImplicitGemmKernel>,
+          cudaFuncAttributePreferredSharedMemoryCarveout, 100);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+    
+    return Status::kSuccess;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    // update the params structure from the arguments
+    params_.ptr_A0 = args.ref_A0.data();
+    params_.ptr_B0 = args.ref_B0.data();
+    params_.ptr_C0 = args.ref_C0.data();
+    params_.ptr_B1 = args.ref_B1.data();
+    params_.ptr_C1 = args.ref_C1.data();
+    params_.ptr_D1 = args.ref_D1.data();
+    params_.output_op_0 = args.output_op_0;
+    params_.output_op_1 = args.output_op_1;
+    params_.semaphore = static_cast<int *>(workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(32 * kWarpCount, 1, 1);
+
+    int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage));
+
+    cutlass::Kernel<B2bImplicitGemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace device
+} // namespace conv
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/examples/13_two_tensor_op_fusion/fused_conv2d.cu
+++ b/examples/13_two_tensor_op_fusion/fused_conv2d.cu
@ -0,0 +1,102 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h"
+#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h"
+#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h"
+#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h"
+
+int run() {
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!(props.major * 10 + props.minor >= 75)) {
+    std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
+              << std::endl;
+
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    return 0;
+  }
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+  std::cout << "Running on SM80" << std::endl;
+  run_nonfused_conv2d_fprop_optimized_f16_sm80();
+  run_fused_conv2d_fprop_optimized_f16_sm80();
+  run_nonfused_conv2d_fprop_optimized_s8_sm80();
+  run_fused_conv2d_fprop_optimized_s8_sm80();
+#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+  std::cout << "Running on SM75" << std::endl;
+  run_nonfused_conv2d_fprop_optimized_f16_sm75();
+  run_fused_conv2d_fprop_optimized_f16_sm75();
+  run_nonfused_conv2d_fprop_optimized_s8_sm75();
+  run_fused_conv2d_fprop_optimized_s8_sm75();
+#endif
+
+  return 0;
+}
+
+int main() {
+
+  bool notSupported = false;
+
+  // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
+  //
+  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
+  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+    std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+
+    notSupported = true;
+
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!(props.major * 10 + props.minor >= 75)) {
+    std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
+              << std::endl;
+
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    return 0;
+  }
+    
+  return run();
+}
+
--- a/examples/13_two_tensor_op_fusion/fused_gemm.cu
+++ b/examples/13_two_tensor_op_fusion/fused_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -22,59 +22,22 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
-/*
-
-This example shows fusing two GEMM mainloops into one kernel. The first GEMM computes relu(alpha*A*B) and 
-the second GEMM computes relu(alpha*A*B+beta*C). The performance measuring environment compares against
-two unfused GEMM operations, demonstrating a speedup of the fused kernel on the 
-NVIDIA Turing GPU architecture.
-
-Problem size:
-  GEMM1 (M,N,K): 128*1600, 64, 576
-  GEMM2 (M,N,K): 128*1600, 128, 64
-
-Note that GEMM1_N = GEMM2_K
-
-The example requires the number of threadblocks be the same across 2 GEMMs and 
-thread_block_tile_N = problem_N so the data required by each layer is threadblock-resident. It 
-also requires warp_tile_N = thread_block_tile_N so the data required by each warp is 
-register-file-resident.
-
-Performance:
-  - fp16 on Tesla T4 @ 1590MHz (non-fused vs. fused): 1.39011 ms vs. 1.26035 ms
-  - int8 on Tesla T4 @ 1590MHz (non-fused vs. fused): 0.751759 ms vs. 0.62971 ms
-  - fp16 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.721144 ms vs. 0.629864 ms
-  - int8 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.379049 ms vs. 0.324764 ms
-  - int8 on GA100 @ 1200MHz (non-fused vs. fused): 0.153795 ms vs. 0.129874 ms
-
-*/

 #include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h"
+#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h"
 #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
 #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h"

 int run() {

-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!(props.major * 10 + props.minor >= 75)) {
-    std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
-              << std::endl;
-
-    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
-    return 0;
-  }
-
 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+  std::cout << "Running on SM80" << std::endl;
+  run_nonfused_gemm_f16_sm80();
+  run_fused_gemm_f16_sm80();
  run_nonfused_gemm_s8_sm80();
  run_fused_gemm_s8_sm80();
 #elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+  std::cout << "Running on SM75" << std::endl;
  run_nonfused_gemm_f16();
  run_fused_gemm_f16();
  run_nonfused_gemm_s8();
@ -85,17 +48,38 @@ int run() {
 }

 int main() {
+
+  bool notSupported = false;
+
  // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
  //
-  // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
+  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
-    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
+    std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;

+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!(props.major * 10 + props.minor >= 75)) {
+    std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
+              << std::endl;
+
+    notSupported = true;
+  }
+
+  if (notSupported) {
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
-  else {
-    return run();
-  }
+
+  return run();
 }

--- a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -335,7 +335,7 @@ struct B2bGemm {
      semaphore.fetch();

      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op_1.set_k_partition(threadblock_tile_offset.k());
+      output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
    }

    // Tile iterator loading from source tensor.
--- a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
@ -0,0 +1,475 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined Implicit GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename B2bMma_,                               ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,                             ///! Epilogue
+  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
+  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
+  typename ConvProblemSize_ = Conv2dProblemSize   ///! Convolutional operator on 2D or 3D problem
+>
+struct B2bImplicitGemmConvolution {
+
+  using B2bMma = B2bMma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp0 = typename B2bMma::OutputOp;
+  using EpilogueOutputOp1 = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename B2bMma::IteratorA0::Element;
+  using LayoutA = typename B2bMma::IteratorA0::Layout;
+  using ElementB = typename B2bMma::IteratorB0::Element;
+  using LayoutB = typename B2bMma::IteratorB0::Layout;
+  using ElementC = typename EpilogueOutputOp1::ElementOutput;
+
+  /// Set output tensor C layout
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp0::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp0::ElementCompute;
+
+  using WarpMmaOperator0 = typename B2bMma::Policy0::Operator;
+  using WarpMmaOperator1 = typename B2bMma::Policy1::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator0::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  using OperatorClass = typename WarpMmaOperator0::OperatorClass;
+  using ArchTag = typename WarpMmaOperator0::ArchTag;
+
+  using ThreadblockShape0 = typename B2bMma::Shape0;
+  using ThreadblockShape1 = typename B2bMma::Shape1;
+  using WarpShape0 = typename WarpMmaOperator0::Shape;
+  using WarpShape1 = typename WarpMmaOperator1::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = B2bMma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = B2bMma::IteratorA0::kIteratorAlgorithm; 
+ 
+  /// Warp count (concept: GemmShape)
+  using WarpCount0 = typename B2bMma::WarpCount0;
+  static int const kThreadCount = 32 * WarpCount0::kCount;
+
+  using TensorRefA0 = typename B2bMma::IteratorA0::TensorRef;
+  using TensorRefB0 = typename B2bMma::IteratorB0::TensorRef;
+  using TensorRefB1 = typename B2bMma::IteratorB1::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+
+  /// Check iterator A and B convolution dimension are the same and 
+  // set device::B2bImplicitGemmConvolution::kConvDim
+  static_assert(B2bMma::IteratorA0::kConvDim == B2bMma::IteratorB0::kConvDim, 
+    "Convolution on different different dimensions is not supported");
+  static int const kConvDim = B2bMma::IteratorA0::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  /// Wgrad C stride idx for implicit gemm algorithm 
+  // Conv2d row-major matrix C (KxRSC) 
+  // Conv3d row-major matrix C (KxTRSC)
+  static int const kWgradCStrideIdx = 
+    cutlass::platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
+
+  /// This chooses the appropriate stride element of the C tensor.
+  static int const kTensorCStrideIdx = 
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
+
+  //
+  //
+  //
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout, 
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+    >;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    ConvProblemSize problem_size_0;
+    ConvProblemSize problem_size_1;
+    TensorRefA0 ref_A0;
+    TensorRefB0 ref_B0;
+    TensorRefC ref_C0;
+    TensorRefB1 ref_B1;
+    TensorRefC ref_C1;
+    TensorRefC ref_D1;
+    typename EpilogueOutputOp0::Params output_op_0;
+    typename EpilogueOutputOp1::Params output_op_1;
+    SplitKMode split_k_mode;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+   
+    CUTLASS_HOST_DEVICE 
+    Arguments(
+      ConvProblemSize const & problem_size_0,
+      ConvProblemSize const & problem_size_1
+    ):
+      problem_size_0(problem_size_0),
+      problem_size_1(problem_size_1) { }
+
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size_0,
+      ConvProblemSize const & problem_size_1,
+      TensorRefA0 const & ref_A0,
+      TensorRefB0 const & ref_B0,
+      TensorRefC const & ref_C0,
+      TensorRefB1 const & ref_B1,
+      TensorRefC const & ref_C1,
+      TensorRefC const & ref_D1,
+      typename EpilogueOutputOp0::Params const & output_op_0,
+      typename EpilogueOutputOp1::Params const & output_op_1,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial
+    ):
+      problem_size_0(problem_size_0),
+      problem_size_1(problem_size_1),
+      ref_A0(ref_A0),
+      ref_B0(ref_B0),
+      ref_C0(ref_C0),
+      ref_B1(ref_B1),
+      ref_C1(ref_C1),
+      ref_D1(ref_D1),
+      output_op_0(output_op_0),
+      output_op_1(output_op_1),
+      split_k_mode(split_k_mode)
+    {
+
+    }
+
+  };
+
+  /// Parameters structure
+  struct Params {
+    ConvProblemSize problem_size_0;
+    ConvProblemSize problem_size_1;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    gemm::GemmCoord implicit_gemm_problem_size_0;
+    gemm::GemmCoord implicit_gemm_problem_size_1;
+    int gemm_k_iterations_0;
+    int gemm_k_iterations_1;
+    typename B2bMma::IteratorA0::Params iterator_A0;
+    typename B2bMma::IteratorA0::Element const *ptr_A0;
+    typename B2bMma::IteratorB0::Params iterator_B0;
+    typename B2bMma::IteratorB0::Element const *ptr_B0;
+    typename Epilogue::OutputTileIterator::Params iterator_C0;
+    typename Epilogue::OutputTileIterator::Element *ptr_C0;
+    typename B2bMma::IteratorB1::Params iterator_B1;
+    typename B2bMma::IteratorB1::Element const *ptr_B1;
+    typename Epilogue::OutputTileIterator::Params iterator_C1;
+    typename Epilogue::OutputTileIterator::Element *ptr_C1;
+    typename Epilogue::OutputTileIterator::Params iterator_D1;
+    typename Epilogue::OutputTileIterator::Element *ptr_D1;
+    typename EpilogueOutputOp0::Params output_op_0;
+    typename EpilogueOutputOp1::Params output_op_1;
+    int *semaphore;
+    SplitKMode split_k_mode;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
+
+    /// 
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      int *semaphore = nullptr
+    ):
+      problem_size_0(args.problem_size_0),
+      problem_size_1(args.problem_size_1),
+      implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)),
+      implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)),
+      grid_tiled_shape(grid_tiled_shape),
+      iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())),
+      ptr_A0(args.ref_A0.data()),
+      iterator_B0(args.problem_size_0, args.ref_B0.layout()),
+      ptr_B0(args.ref_B0.data()),
+      iterator_C0(ConvOutputIteratorParameter::layout(args.ref_C0)),
+      ptr_C0(args.ref_C0.data()),
+      iterator_B1(args.problem_size_1, args.ref_B1.layout()),
+      ptr_B1(args.ref_B1.data()),
+      iterator_C1(ConvOutputIteratorParameter::layout(args.ref_C1)),
+      ptr_C1(args.ref_C1.data()),
+      iterator_D1(ConvOutputIteratorParameter::layout(args.ref_D1)),
+      ptr_D1(args.ref_D1.data()),
+      output_op_0(args.output_op_0),
+      output_op_1(args.output_op_1),
+      semaphore(semaphore),
+      split_k_mode(args.split_k_mode)
+    {
+      gemm_k_iterations_0 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape0::kK, args.problem_size_0);
+      gemm_k_iterations_1 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape1::kK, args.problem_size_1);
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        implicit_gemm_problem_size_0,
+        {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+        args.problem_size_0.split_k_slices);
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename B2bMma::B2bMmaSharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  B2bImplicitGemmConvolution() { } 
+
+  /// Executes one ImplicitGEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename B2bMma::IteratorA0 iterator_A0(
+      params.iterator_A0,
+      params.problem_size_0,
+      params.ptr_A0,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.m() * B2bMma::Shape0::kM,
+        threadblock_tile_idx.k() * B2bMma::Shape0::kK
+      )
+    );
+    
+    typename B2bMma::IteratorB0 iterator_B0(
+      params.iterator_B0,
+      params.problem_size_0,
+      params.ptr_B0,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.k() * B2bMma::Shape0::kK,
+        threadblock_tile_idx.n() * B2bMma::Shape0::kN
+      )
+    );
+
+    typename B2bMma::IteratorB1 iterator_B1(
+      params.iterator_B1,
+      params.problem_size_1,
+      params.ptr_B1,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.k() * B2bMma::Shape1::kK,
+        threadblock_tile_idx.n() * B2bMma::Shape1::kN
+      )
+    );
+
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    EpilogueOutputOp0 output_op_0(params.output_op_0);
+
+    // Construct thread-scoped matrix multiply
+    B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename B2bMma::FragmentC0 src_accum;
+    typename B2bMma::FragmentC1 accumulators;
+
+    src_accum.clear();
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1, src_accum, output_op_0);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp1 output_op_1(params.output_op_1);
+
+    // Construct the semaphore.
+    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
+
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+    
+    // Compute logical position within grid
+    threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op_1.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
+    }
+
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() * B2bMma::Shape1::kM,
+      threadblock_tile_idx.n() * B2bMma::Shape1::kN
+    );
+
+    // Tile iterator writing to destination tensor
+    typename Epilogue::OutputTileIterator iterator_D1(
+      params.iterator_D1,
+      params.ptr_D1,
+      ConvOutputIteratorParameter::extent(params.problem_size_1),
+      thread_idx,
+      threadblock_offset
+    );
+    
+    // Tile iterator reading from source accumulator tensor
+    typename Epilogue::OutputTileIterator iterator_C1(
+      params.iterator_C1,
+      params.ptr_C1,
+      ConvOutputIteratorParameter::extent(params.problem_size_1),
+      thread_idx,
+      threadblock_offset
+    );
+
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_idx.k()) {
+        iterator_C1 = iterator_D1;
+      }
+
+      semaphore.wait(threadblock_tile_idx.k());
+
+      __threadfence();
+    }
+    // Each split-k-slice writes to a unique tensor location
+    else if (params.split_k_mode == SplitKMode::kParallel) {
+      iterator_D1.add_pointer_offset(threadblock_tile_idx.k() * 
+        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size_1));
+    }
+
+    // Run efficient epilogue
+    epilogue(output_op_1, iterator_D1, accumulators, iterator_C1);
+  
+    //
+    // Release the semaphore
+    //
+
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_idx.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  } 
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
@ -1,29 +1,28 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
- * Redistribution and use in source and binary forms, with or without
- *modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *notice, this list of conditions and the following disclaimer in the
- *documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its
- *contributors may be used to endorse or promote products derived from this
- *software without specific prior written permission.
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
 *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
- *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING
- *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
+
 /*! \file
    \brief 
      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
@ -118,6 +117,75 @@ template <
 >
 struct DefaultB2bGemm;

+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultB2bGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape0, ThreadblockShape1,
+                   WarpShape0, WarpShape1, InstructionShape,
+                   EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator> {
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, 
+      InstructionShape, Stages, Operator, EpilogueOutputOp0>::ThreadblockB2bMma;
+
+  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
+          EpilogueOutputOp1::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+
 ////////////////////////////////////////////////////////////////////////////////

 /// Partial specialization for Turing Architecture
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
@ -0,0 +1,757 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape0_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA0_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA0_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA0,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB0_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB0_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB0,
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile
+    //  (concept::MmaTensorOpFragmentIterator) 
+    typename FragmentIteratorA1_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) 
+    typename OutputOp_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy0_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class B2bImplicitGemmMultistage : 
+  public gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
+public:
+  ///< Base class
+  using Base = gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape0 = Shape0_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA0 = IteratorA0_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB0 = IteratorB0_;
+  ///< Policy describing tuning details
+  using Policy0 = Policy0_;
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over tiles of A operand in global memory
+  using FragmentIteratorA1 = FragmentIteratorA1_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+
+  ///< Epilogue after 1st Gemm
+  using OutputOp = OutputOp_;
+ 
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+
+  //
+  // Dependent types
+  //
+
+  using ElementC = typename Policy0::Operator::ElementC;
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+  
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+  
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations0 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+    static_assert(Base::kWarpGemmIterations1 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA0 =
+        IteratorA0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB0 =
+        IteratorB0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA0 =
+        (AsyncCopyIterationsPerStageA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB0 =
+        (AsyncCopyIterationsPerStageB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB1 =
+        (AsyncCopyIterationsPerStageB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
+  using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
+  /// Warp Fragment of operand A1 loaded from accmulator tile
+  using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
+  using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bImplicitGemmMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::B2bMmaSharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),
+      smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx),
+      smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k});
+    this->warp_tile_iterator_B0_.add_tile_offset(
+        {Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n});
+    this->warp_tile_iterator_B1_.add_tile_offset(
+        {Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_0(
+    IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
+    int group_start_A0 = 0, int group_start_B0 = 0) {
+
+    iterator_A0.set_iteration_index(group_start_A0);
+    this->smem_iterator_A0_.set_iteration_index(group_start_A0);
+      
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
+
+      if (group_start_A0 + j < Detail::AsyncCopyIterationsPerStageA0) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
+                              IteratorA0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+                dst_ptr, iterator_A0.get(), iterator_A0.valid());
+
+        ++iterator_A0;
+
+        ++this->smem_iterator_A0_;
+      }
+    }
+
+    iterator_B0.set_iteration_index(group_start_B0);
+
+    this->smem_iterator_B0_.set_iteration_index(group_start_B0);
+    
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
+      if (group_start_B0 + j < Detail::AsyncCopyIterationsPerStageB0) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+        
+        int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
+                              IteratorB0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+                dst_ptr, iterator_B0.get(), iterator_B0.valid());
+
+        ++iterator_B0;
+        ++this->smem_iterator_B0_;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(
+    IteratorB1 &iterator_B1,
+    int group_start_B1 = 0) {
+
+    iterator_B1.set_iteration_index(group_start_B1);
+
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+    
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
+      if (group_start_B1 + j < Detail::AsyncCopyIterationsPerStageB1) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+        
+        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+                              IteratorB1::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+                dst_ptr, iterator_B1.get(), iterator_B1.valid());
+
+        ++iterator_B1;
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations_0,
+      ///< destination accumulator tile
+      FragmentC1 &accum,
+      ///< iterator over A operand in global memory
+      IteratorA0 iterator_A0,
+      ///< iterator over B operand in global memory
+      IteratorB0 iterator_B0,
+      ///< iterator over B operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC0 const &src_accum,
+      ///< epilogue operation after 1st Gemm
+      OutputOp output_op_0,
+      ///< Imaginary strides used for planar-complex only - ignored here
+      int64_t imag_stride_A = 0,
+      int64_t imag_stride_B = 0) {
+
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_0) {
+
+      iterator_A0.set_iteration_index(0);
+      this->smem_iterator_A0_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA0; ++j) {
+        typename IteratorA0::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorA0::AccessType *>(
+            this->smem_iterator_A0_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorA0::Element>::value *
+            IteratorA0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+          dst_ptr, iterator_A0.get(), iterator_A0.valid());
+
+        ++iterator_A0;
+        ++this->smem_iterator_A0_;
+      }
+
+      iterator_B0.set_iteration_index(0);
+      this->smem_iterator_B0_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB0; ++j) {
+        typename IteratorB0::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorB0::AccessType *>(
+              this->smem_iterator_B0_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorB0::Element>::value *
+            IteratorB0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+            dst_ptr, iterator_B0.get(), iterator_B0.valid());
+
+        ++iterator_B0;
+        ++this->smem_iterator_B0_;
+      }
+
+      // Move to the next stage
+      iterator_A0.advance();
+      iterator_B0.advance();
+
+      this->smem_iterator_A0_.add_tile_offset({0, 1});
+      this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
+    WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
+    WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
+    WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
+
+    Operator0 warp_mma0;
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance_0(iterator_A0, iterator_B0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
+                       warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
+    
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        
+        this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k > 0)
+          warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
+                             warp_transformed_frag_B0[warp_mma_k % 2],
+                             warp_loaded_frag_A0[warp_mma_k % 2],
+                             warp_loaded_frag_B0[warp_mma_k % 2]);
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A0, group_start_iteration_B0;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations0) {
+          group_start_iteration_A0 = 0;
+          group_start_iteration_B0 = 0;
+        } else {
+          group_start_iteration_A0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
+        }
+
+        copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
+                               group_start_iteration_B0);
+
+        warp_mma0(
+                accum0, 
+                warp_transformed_frag_A0[warp_mma_k % 2],
+                warp_transformed_frag_B0[warp_mma_k % 2], 
+                accum0
+                );
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
+          warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A0.advance();
+          iterator_B0.advance();
+
+          this->smem_iterator_A0_.add_tile_offset({0, 1});
+          this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK *
+                        Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK *
+                     Base::kWarpGemmIterations0,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations_0;
+        }
+      }
+
+    }
+
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+
+    // 2nd Implicit Gemm
+
+    /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+    FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
+
+    //
+    // Prologue
+    //
+    int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_1) {
+
+      iterator_B1.set_iteration_index(0);
+      this->smem_iterator_B1_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB1; ++j) {
+        typename IteratorB1::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorB1::AccessType *>(
+              this->smem_iterator_B1_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorB1::Element>::value *
+            IteratorB1::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+            dst_ptr, iterator_B1.get(), iterator_B1.valid());
+
+        ++iterator_B1;
+        ++this->smem_iterator_B1_;
+      }
+
+      // Move to the next stage
+      iterator_B1.advance();
+
+      this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0);
+    this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
+
+    ++warp_tile_iterator_A1_;
+    ++this->warp_tile_iterator_B1_;
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance_1(iterator_B1);
+
+    smem_write_stage_idx = Base::kStages - 1;
+    smem_read_stage_idx = 0;
+
+    warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
+                       warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
+
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1 - (Base::kStages - 1); 
+            gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        
+        warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
+        this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+
+        ++warp_tile_iterator_A1_;
+        ++this->warp_tile_iterator_B1_;
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
+                             warp_transformed_frag_B1[warp_mma_k % 2],
+                             warp_loaded_frag_A1[warp_mma_k % 2],
+                             warp_loaded_frag_B1[warp_mma_k % 2]);
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_B1;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1) {
+          group_start_iteration_B1 = 0;
+        } else {
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+        }
+
+        copy_tiles_and_advance_1(iterator_B1,
+                               group_start_iteration_B1);
+
+        warp_mma1(
+                accum, 
+                warp_transformed_frag_A1[warp_mma_k % 2],
+                warp_transformed_frag_B1[warp_mma_k % 2], 
+                accum
+                );
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.advance();
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+        }
+      }
+
+    }
+
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
@ -0,0 +1,483 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape0_,
+  /// Iterates over tiles of A operand in global memory 
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA0_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA0_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB0_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB0_,
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape1_,
+  /// Iterates over the intermediate accumulator tile
+  //  (concept::MmaTensorOpFragmentIterator) 
+  typename FragmentIteratorA1_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB1_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB1_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Data type of accumulator matrix
+  typename LayoutC_,
+  /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) 
+  typename OutputOp_,
+  /// Policy describing tuning details (concept: MmaPolicy)
+  typename Policy0_,
+  /// Policy describing tuning details (concept: MmaPolicy)
+  typename Policy1_,
+  /// Transformation applied to A operand
+  typename TransformA0_ = NumericArrayConverter<
+    typename SmemIteratorA0_::Element, 
+    typename IteratorA0_::Element, 
+    IteratorA0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to A operand
+  typename TransformB0_ = NumericArrayConverter<
+    typename SmemIteratorB0_::Element, 
+    typename IteratorB0_::Element, 
+    IteratorB0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to A operand
+  typename TransformB1_ = NumericArrayConverter<
+    typename SmemIteratorB1_::Element, 
+    typename IteratorB1_::Element, 
+    IteratorB1_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class B2bImplicitGemmPipelined : public gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
+public:
+
+  ///< Base class
+  using Base = gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2>;
+
+  using Shape0 = Shape0_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA0 = IteratorA0_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB0 = IteratorB0_;     ///< Iterates over tiles of B operand in global memory
+  using Policy0 = Policy0_;           ///< Policy0 describing tuning details
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+
+  using Shape1 = Shape1_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using FragmentIteratorA1 = FragmentIteratorA1_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB1 = IteratorB1_;     ///< Iterates over tiles of B operand in global memory
+  using Policy1 = Policy1_;           ///< Policy1 describing tuning details
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+  
+  using OutputOp = OutputOp_;       ///< Epilogue after 1st Gemm
+
+  using TransformA0 = TransformA0_;
+  using TransformB0 = TransformB0_;
+  using TransformB1 = TransformB1_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA0 = typename IteratorA0::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB0 = typename IteratorB0::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB1 = typename IteratorB1::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+ 
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy0::Operator::ArchTag;
+
+  /// Complex transform on A0 operand
+  static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+  /// Complex transform on B0 operand
+  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+
+  /// Complex transform on B1 operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+private:
+
+  using WarpFragmentA0 = typename Operator0::FragmentA;
+  using WarpFragmentB0 = typename Operator0::FragmentB;
+  /// Warp Fragment of operand A1 loaded from accmulator tile
+  using WarpFragmentA1 = typename FragmentIteratorA1::Fragment;
+  using WarpFragmentB1 = typename Operator1::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B0 operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Iterator to write threadblock-scoped tile of B1 operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bImplicitGemmPipelined(
+    typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx                                        ///< ID of each thread within a warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),
+    smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx),
+    smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
+
+    //These may change across different GEMM layers
+    int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k;
+    int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k_0});
+    this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n});
+    this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n});
+
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations_0,                           ///< number of iterations of the mainloop
+    FragmentC1 &accum,                                 ///< destination accumulator tile
+    IteratorA0 iterator_A,                             ///< iterator over A operand in global memory
+    IteratorB0 iterator_B0,                            ///< iterator over B0 operand in global memory
+    IteratorB1 iterator_B1,                            ///< iterator over B1 operand in global memory
+    FragmentC0 const &src_accum,                       ///< source accumulator tile
+    OutputOp output_op_0,                               ///< epilogue operation after 1st Gemm
+    TransformA0 transform_A0 = TransformA0(),            ///< transformation applied to A0 fragment
+    TransformB0 transform_B0 = TransformB0(),           ///< transformation applied to B0 fragment
+    TransformB1 transform_B1 = TransformB1()) {         ///< transformation applied to B1 fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    FragmentA0 tb_frag_A;
+    FragmentB0 tb_frag_B0;
+
+    tb_frag_A.clear();
+    tb_frag_B0.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B0.load(tb_frag_B0);
+
+    ++iterator_A;
+    ++iterator_B0;
+
+    this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+    this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B0_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA0 warp_frag_A0[2];
+    WarpFragmentB0 warp_frag_B0[2];
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    Operator0 warp_mma0;
+
+    int smem_write_stage_idx = 1;
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
+    // shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+
+          this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+          __syncthreads();
+          
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B0_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        
+        this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_A.load(tb_frag_A);
+          iterator_B0.load(tb_frag_B0);
+    
+          ++iterator_A;
+          ++iterator_B0;
+        }
+
+        warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
+                 warp_frag_B0[warp_mma_k % 2], accum0);
+
+      }
+    }
+
+
+    //2nd Implicit Gemm
+    
+    /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+    FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
+
+    //
+    // Prologue
+    //
+
+    FragmentB1 tb_frag_B1;
+
+    tb_frag_B1.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_B1.load(tb_frag_B1);
+
+
+    ++iterator_B1;
+
+    this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+    ++this->smem_iterator_B1_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA1 warp_frag_A1[2];
+    WarpFragmentB1 warp_frag_B1[2];
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+
+    warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0);
+    this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
+
+    ++warp_tile_iterator_A1_;
+    ++this->warp_tile_iterator_B1_;
+
+    Operator1 warp_mma1;
+
+    smem_write_stage_idx = 1;
+    
+    int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
+    // shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_PRAGMA_UNROLL
+    for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
+
+          this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+          __syncthreads();
+          
+          ++this->smem_iterator_B1_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        
+        warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
+        this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
+
+        ++warp_tile_iterator_A1_;
+        ++this->warp_tile_iterator_B1_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_B1.load(tb_frag_B1);
+    
+          ++iterator_B1;
+        }
+
+        warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
+                 warp_frag_B1[warp_mma_k % 2], accum);
+
+      }
+    }
+
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -635,40 +635,9 @@ public:
         ++stage, --gemm_k_iterations_1) {

      if (gemm_k_iterations_1 == 0) {
-//        iterator_A1.clear_mask();
        iterator_B1.clear_mask();
      }

-#if 0
-      iterator_A1.set_iteration_index(0);
-      this->smem_iterator_A1_.set_iteration_index(0);
-
-      // LDGSTS for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::TBLDGSTSIterationsA1; ++j) {
-        typename IteratorA1::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA1::AccessType *>(
-                this->smem_iterator_A1_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA1::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA1::Element>::value *
-              IteratorA1::ThreadMap::kElementsPerAccess /
-              IteratorA1::kAccessesPerVector / 8;
-
-          int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
-              dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
-
-          ++iterator_A0;
-        }
-
-        ++this->smem_iterator_A0_;
-      }
-#endif
-
      iterator_B1.set_iteration_index(0);
      this->smem_iterator_B1_.set_iteration_index(0);

@ -696,19 +665,14 @@ public:
      }

      // Move to the next stage
-      //iterator_A1.add_tile_offset({0, 1});
      iterator_B1.add_tile_offset({1, 0});

-      //this->smem_iterator_A1_.add_tile_offset({0, 1});
      this->smem_iterator_B1_.add_tile_offset({1, 0});

      // Defines the boundary of a stage of cp.async.
      cutlass::arch::cp_async_fence();
    }

-    // Perform accumulation in the 'd' output operand
-//    FragmentC0 accum0 = src_accum;
-
    // DEPBAR+SYNC
    cutlass::arch::cp_async_wait<Base::kStages - 2>();
    __syncthreads();
@ -722,7 +686,6 @@ public:

    Operator1 warp_mma1;

-//    this->warp_tile_iterator_A1_.set_kgroup_index(0);
    this->warp_tile_iterator_B1_.set_kgroup_index(0);

    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0);
@ -732,7 +695,6 @@ public:
    ++this->warp_tile_iterator_B1_;

    if (gemm_k_iterations_1 == 0) {
-//      iterator_A1.clear_mask();
      iterator_B1.clear_mask();
    }

@ -762,7 +724,6 @@ public:
        // Load warp-level tiles from shared memory, wrapping to k offset if
        // this is the last group as the case may be.

-//        this->warp_tile_iterator_A1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
        
        warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
@ -777,6 +738,7 @@ public:
                             warp_loaded_frag_A1[warp_mma_k % 2],
                             warp_loaded_frag_B1[warp_mma_k % 2]);

+
        warp_mma1(
          accum, 
          warp_transformed_frag_A1[warp_mma_k % 2],
@ -823,7 +785,7 @@ public:

          if (smem_read_stage_idx == (Base::kStages - 1)) {
            this->warp_tile_iterator_B1_.add_tile_offset(
-                {-Base::kStages * Policy0::kPartitionsK *
+                {-Base::kStages * Policy1::kPartitionsK *
                     Base::kWarpGemmIterations1,
                 0});
            smem_read_stage_idx = 0;
@ -831,7 +793,6 @@ public:
            ++smem_read_stage_idx;
          }

-//          --gemm_k_iterations_1;
          if (gemm_k_iterations_1 == 1) {
            iterator_B1.clear_mask();
          }
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -454,11 +454,11 @@ public:
          this->smem_iterator_B1_.store(tb_frag_B1);

          __syncthreads();
-          ++smem_iterator_B1_;
+          ++this->smem_iterator_B1_;

          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
          if (smem_write_stage_idx == 1) {
-            smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
          }
          else {
            this->warp_tile_iterator_B1_.add_tile_offset(
--- a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -93,7 +93,7 @@ template <
 struct DefaultB2bMma;

 ////////////////////////////////////////////////////////////////////////////////
-/// Specialization for row-major output
+/// Specialization for row-major output with 2-stage pipeline
 template <
    /// Element type for A matrix operand
    typename ElementA,
@ -110,8 +110,6 @@ template <
    /// Element type for internal accumulation
    typename ElementAccumulator,
    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
    typename ArchTag,
    /// Threadblock-level tile size (concept: GemmShape)
    typename ThreadblockShape0,
@ -129,7 +127,7 @@ template <
    typename EpilogueOutputOp>
 struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                  kAlignmentB, ElementAccumulator, layout::RowMajor,
-                  OperatorClass, ArchTag, 
+                  arch::OpClassTensorOp, ArchTag, 
                  ThreadblockShape0, ThreadblockShape1,
                  WarpShape0, WarpShape1,
                  InstructionShape, 2, Operator, EpilogueOutputOp, false> {
@ -137,11 +135,11 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
-      OperatorClass, 2, Operator>;
+      arch::OpClassTensorOp, 2, Operator>;
  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
-      OperatorClass, 2, Operator>;
+      arch::OpClassTensorOp, 2, Operator>;

  // Define iterators over tiles from the A operand
  using IteratorA0 =
@ -162,7 +160,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>;
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;

  // Define iterators over tiles from the B operand
  using IteratorB1 =
@ -181,9 +179,120 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;

 };
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output for multi-stage
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Epilogue output operator
+    typename EpilogueOutputOp>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, 
+                  ThreadblockShape0, ThreadblockShape1,
+                  WarpShape0, WarpShape1,
+                  InstructionShape, Stages, Operator, EpilogueOutputOp, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+ 
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, 
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, 
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using AccessTypeA0 = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+          ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using AccessTypeB0 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+          ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using AccessTypeB1 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+          ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB1>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      MmaCore0::kCacheOpA, 
+      IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB, 
+      typename MmaCore1::Shape, FragmentIteratorA1,
+      IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
+      ElementAccumulator, layout::RowMajor,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>;
+
+};
+
+
 ////////////////////////////////////////////////////////////////////////////////

-/// Specialization for column-major-interleaved output
+/// Specialization for column-major-interleaved output with 2-stage pipeline
 template <
    /// Element type for A matrix operand
    typename ElementA,
@ -258,7 +367,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
          MmaCore1::Shape::kK, //kBlocksColumn
          ElementAccumulator, ElementA, AccumulatorLayout, 
-          InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>;
+          InstructionShape, EpilogueOutputOp>;

  // Define iterators over tiles from the B operand
  using IteratorB1 =
@ -281,7 +390,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,

 ////////////////////////////////////////////////////////////////////////////////

-/// Specialization for column-major-interleaved output
+/// Specialization for column-major-interleaved output with multi-stage
 template <
    /// Element type for A matrix operand
    typename ElementA,
@ -360,7 +469,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
          MmaCore1::Shape::kK, //kBlocksColumn
          ElementAccumulator, ElementA, AccumulatorLayout, 
-          InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>;
+          InstructionShape, EpilogueOutputOp>;

  // Define iterators over tiles from the B operand
  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
--- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
+++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -113,31 +113,6 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,

 int run() {

-  // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available
-  // in CUDA 11.0. 
-  //
-  // CUTLASS must be compiled with CUDA 11 Toolkit to run these examples.
-  if (!(__CUDACC_VER_MAJOR__ >= 11)) {
-    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
-    return -1;
-  }
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!((props.major * 10 + props.minor) >= 80)) {
-    std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80."
-              << std::endl;
-
-    // Return 0 so tests are considered passing if run on unsupported platforms.
-    return 0;
-  }
-
  const int length_m = 5120;
  const int length_n = 4096;
  const int length_k = 4096;
@ -216,8 +191,12 @@ int run() {
  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm_op;

+  // Check the problem size is supported or not 
+  cutlass::Status status = gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
  // Initialize CUTLASS kernel with arguments and workspace pointer
-  cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
+  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  // Launch initialized CUTLASS kernel
@ -262,17 +241,36 @@ int run() {
 }

 int main() {
+  
+  bool notSupported = false;
+
  // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available
  // in CUDA 11.0. 
  //
  // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples.
  if (!(__CUDACC_VER_MAJOR__ >= 11)) {
    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+    notSupported = true;
+  }

-    // Returning zero so this test passes when built on older Toolkits. 
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!((props.major * 10 + props.minor) >= 80)) {
+    std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
-  else {
-    return run();
-  }
+
+  return run();
 }
--- a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
+++ b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
--- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
+++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -71,7 +71,7 @@ using SmArch = cutlass::arch::Sm80;

 // This code section describes the tile size a thread block will compute
 using ShapeMMAThreadBlock =
-    cutlass::gemm::GemmShape<256, 128, 256>;  // <- threadblock tile M = 128, N = 128, K = 256
+    cutlass::gemm::GemmShape<128, 128, 256>;  // <- threadblock tile M = 128, N = 128, K = 256
 // This code section describes tile size a warp will compute
 using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 256>;  // <- warp tile M = 64, N = 64, K = 256
 // This code section describes the size of MMA op
@ -123,31 +123,6 @@ constexpr int kMetaSizeInBits = Gemm::kMetaSizeInBits;

 int run() {

-  // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available
-  // in CUDA 11.1. 
-  //
-  // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples.
-  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) {
-    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl;
-    return -1;
-  }
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!((props.major * 10 + props.minor) >= 80)) {
-    std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80."
-              << std::endl;
-
-    // Return 0 so tests are considered passing if run on unsupported platforms.
-    return 0;
-  }
-
  const int length_m = 512;
  const int length_n = 512;
  const int length_k = 1024;
@ -248,8 +223,12 @@ int run() {
  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm_op;

+  // Check the problem size is supported or not 
+  cutlass::Status status = gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
  // Initialize CUTLASS kernel with arguments and workspace pointer
-  cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
+  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  // Launch initialized CUTLASS kernel
@ -295,17 +274,37 @@ int run() {
 }

 int main() {
+  
+  bool notSupported = false;
+
  // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available
  // in CUDA 11.1. 
  //
  // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples.
+  
  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) {
    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl;
+    notSupported = true;
+  }

-    // Returning zero so this test passes when built on older Toolkits. 
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!((props.major * 10 + props.minor) >= 80)) {
+    std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
-  else {
-    return run();
-  }
+
+  return run();
 }
--- a/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
+++ b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
@ -0,0 +1,28 @@
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright notice, this list of
+#       conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright notice, this list of
+#       conditions and the following disclaimer in the documentation and/or other materials
+#       provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+#       to endorse or promote products derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+cutlass_example_add_executable(
+  16_ampere_tensorop_conv2dfprop
+  ampere_tensorop_conv2dfprop.cu
+  )
+
--- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
+++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
@ -0,0 +1,760 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+
+This example shows how to run convolution kernels using functions and data structures
+provided by CUTLASS using tensor cores; which we run on a NVIDIA Ampere GPU.
+
+Writing a single high performance convolution kernel is hard but do-able. Whereas writing
+high performance kernels at scale which works for multiple problem sizes with good abstractions is
+really hard. CUTLASS solves this problem by providing simplified abstractions to compose
+multiple sections of implicit gemm kernel. When used properly, the kernels can hit peak performance
+of GPU easily.
+
+CUTLASS divides a kernel into hierarchical composable sections. Which means, at each thread, warp
+and thread-block level, they compute on their own tile-size with higher level of tile sizes being
+composed from lower level ones. Multiple thread-tiles (tile size each thread computes) can be used
+to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
+threadblock-tile (tile size computed by a threadblock).
+
+In thie example, we split variable initialization into
+1. Setting up data properties : describes how tensors are laid out in the memory and how the kernel
+can view them (logical to physical mapping)
+2. Setting up computation properties : describes how the above set tensors will be used to compute
+output of convolution.
+
+First, we setup the data types of the input tensor A, weights' tensor B and output tensor C along
+with alpha, beta as the equation for convolution is C = alpha * Conv2dFprop(A, B) + beta * C. In CUTLASS,
+the kernels first compute Conv2dFprop(A, B) and leave the rest of the computation to end of the kernel as
+alpha * X + beta * C is a simple element-wise operation on X (Conv2dFprop(A, B)) and C. We call this as 
+epilogue of kernel. Hence, we setup data types for alpha and beta to be equal to 
+ElementComputeEpilogue = float. We use the data type for elements in input tensor A and B as 
+cutlass::half_t. We convey this to CUTLASS kernel by initializing template variables ElementAccumulator (float),
+ElementComputeEpilogue (float), ElementInputA (cutlass::half_t), ElementInputB (cutlass::half_t),
+ElementOutput (float). Communicating just the data type is not enough. As the data is laid out 
+linearly in memory, we have to convey the layout of tensors. We do that by initializing template
+variables LayoutInputA, LayoutInputB and LayoutOutput to TensorNHWC cutlass variable. Next, we setup
+rules to comptue alpha * X + beta * C which is called epilogue of the kernel. We initialize template
+variable EpilogueOp, which takes the data type of output ElementOutput (float), the number of
+elements per vector memory access (8), data type of accumulator (float) and data type of
+computation of linear combination (alpha * X + beta * C).
+
+Now that we setup the properties of data, we have to setup properties of computation.
+
+Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64,
+64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it
+internally deduces the amount of threads needed per thread-block, amount of shared memory, storing
+data in bank-conflict free manner, and ton of other variables required to compose, intialize and
+launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer
+from understanding and coding complicated hardware optimizations which can easily go wrong.
+
+CUTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? MMA pipelines
+constitute the whole process of loading input data from global memory to shared memory, loading data
+from shared memory to registers, doing matrix multiplication, store to global memory. The below flow
+sequence shows a typical mma multistage pipeline.
+(see include/cutlass/conv/threadblock/implicit_gemm_multistage.h)
+
+tensor in global memory --cp_async--> tile in shared memory --smem loads--> registers 
+--mma--> registers --global stores--> output to global memory
+
+NVIDIA Ampere uses `cp_async` to build multistage software pipeline to better hide latencies.
+
+
+There are few more template variables initialized such as, which threadblock tile of output matrix
+is done which threadblock launched on an SM, CUDA SM architecture of GPU you want to run on.
+
+These are all put together to create a template variable which describes CUTLASS Implicit GEMM
+kernel using cutlass::conv::device::ImplicitGemm template.
+
+The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
+We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come
+in the way of learning CUTLASS.
+
+Once all the tensors are initialized and filled with data, create arguments tuple to launch CUTLASS
+kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K = 64,
+R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the
+important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space
+memory required by the kernel we instantiated. If yes, we create it and pass it along with other
+arguments created to intialize CUTLASS kernel then, the kernel is launched.
+
+In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to
+compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel.
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/convolution.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "helper.h"
+
+// The code section below describes datatype for input, output tensors and computation between
+// elements 
+using ElementAccumulator = float;                  // Data type of accumulator
+using ElementComputeEpilogue = float;              // Data type of epilogue computation (alpha, beta)
+using ElementInputA = cutlass::half_t;             // Data type of elements in input tensor
+using ElementInputB = cutlass::half_t;             // Data type of elements in input tensor
+using ElementOutput = float;                       // Data type of elements in output tensor
+
+using LayoutInputA = cutlass::layout::TensorNHWC;
+using LayoutInputB = cutlass::layout::TensorNHWC;
+using LayoutOutput = cutlass::layout::TensorNHWC;
+
+// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
+using MMAOp = cutlass::arch::OpClassTensorOp;
+
+// This code section describes CUDA SM architecture number
+using SmArch = cutlass::arch::Sm80;
+
+// This code section describes the tile size a thread block will compute
+using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>;  // Threadblock tile shape
+
+// This code section describes tile size a warp will compute
+using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;         // Warp tile shape
+
+// This code section describes the size of MMA op
+using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;    // TensorCore instruction shape
+
+// This code section describes how threadblocks are scheduled on GPU
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
+
+// Number of pipelines you want to use
+constexpr int NumStages = 3;
+
+// This code section describe iterator algorithm selected is Analytic or Optimized
+static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kAnalytic;
+
+// This code section describes the epilogue part of the kernel, we use default value
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+    ElementOutput,                                     // Data type of output matrix.
+    128 / cutlass::sizeof_bits<ElementOutput>::value,  // The number of elements per vectorized.
+                                                       // memory access. This becomes the vector width of
+                                                       // math instructions in the epilogue too.
+    ElementAccumulator,                                // Data type of accumulator
+    ElementComputeEpilogue>;                           // Data type for alpha/beta in linear combination
+
+
+using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+  ElementInputA, LayoutInputA,
+  ElementInputB, LayoutInputB,
+  ElementOutput, LayoutOutput,
+  ElementAccumulator,
+  MMAOp,
+  SmArch,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOp,
+  SwizzleThreadBlock,
+  NumStages,
+  cutlass::arch::OpMultiplyAdd,
+  IteratorAlgorithm
+>::Kernel;
+
+using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  cutlass::Tensor4DCoord input_size;
+  cutlass::Tensor4DCoord filter_size;
+  cutlass::Tensor4DCoord padding;
+  cutlass::MatrixCoord conv_stride;
+  cutlass::MatrixCoord dilation;
+  bool reference_check;
+  bool measure_performance;
+  int iterations;
+  bool save_workspace;
+  ElementComputeEpilogue alpha;
+  ElementComputeEpilogue beta;
+  bool benchmark;
+  std::string tag;
+
+  Options():
+    help(false),
+    input_size(1, 32, 32, 32),
+    filter_size(32, 3, 3, 32),
+    padding(1, 1, 1, 1),
+    conv_stride(1, 1),
+    dilation(1, 1),
+    reference_check(false),
+    measure_performance(true),
+    iterations(20),
+    save_workspace(false),
+    alpha(1),
+    beta(0),
+    benchmark(false) { }
+
+  // Verify the problem size is compatible with the CUTLASS Convolution implementation.
+  bool valid() {
+
+    //
+    // CUTLASS attempts to load 128b vectors of cutlass::half_t (F16) elements. Consequently,
+    // all pointers, strides, and tensor extents must be divisible by 8 elements.
+    //
+    int const kAlignment = 8;
+
+    if ((input_size.c() % kAlignment) ||
+      (filter_size.n() % kAlignment)) {
+
+      // misaligned tensors
+      return false;
+    }
+
+    // Invalid padding
+    if ((padding.h() != filter_size.h() / 2) ||
+      (padding.w() != filter_size.w() / 2)) {
+
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Updates input and filter sizes
+  void update(
+    cutlass::Tensor4DCoord input_size,
+    cutlass::Tensor4DCoord filter_size) {
+
+    this->input_size = input_size;
+    this->filter_size = filter_size;
+
+    padding.n() = filter_size.h() / 2;
+    padding.h() = filter_size.h() / 2;
+    padding.w() = filter_size.w() / 2;
+    padding.c() = filter_size.w() / 2;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    if (cmd.check_cmd_line_flag("ref-check")) {
+      reference_check = true;
+    }
+
+    if (cmd.check_cmd_line_flag("perf-check")) {
+      measure_performance = true;
+    }
+
+    if (cmd.check_cmd_line_flag("save-workspace")) {
+      save_workspace = true;
+    }
+
+    if (cmd.check_cmd_line_flag("benchmark")) {
+      benchmark = true;
+    }
+
+    cmd.get_cmd_line_argument("n", input_size.n());
+    cmd.get_cmd_line_argument("h", input_size.h());
+    cmd.get_cmd_line_argument("w", input_size.w());
+    cmd.get_cmd_line_argument("c", input_size.c());
+
+    cmd.get_cmd_line_argument("k", filter_size.n());
+    cmd.get_cmd_line_argument("r", filter_size.h());
+    cmd.get_cmd_line_argument("s", filter_size.w());
+    filter_size.c() = input_size.c(); 
+
+    cmd.get_cmd_line_argument("alpha", alpha);
+    cmd.get_cmd_line_argument("beta", beta);
+    
+    cmd.get_cmd_line_argument("iterations", iterations);
+    cmd.get_cmd_line_argument("tag", tag);
+
+    if (filter_size.h() == 3 && filter_size.w() == 3) {
+      padding = {1, 1, 1, 1};
+    }
+    else {
+      filter_size.h() = 1;
+      filter_size.w() = 1;
+      padding = {0, 0, 0, 0};
+    }
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "16_ampere_tensorop_conv2dfprop example\n\n"
+      << "  This example uses Ampere's Tensor Core operators on F16 data types to compute\n"
+      << "  forward convolution on tensors of layout NHWC.\n\n"
+      << "Options:\n\n"
+      << "  --help               If specified, displays this usage statement.\n\n"
+      << "  --n <int>            Input tensor extent N\n"
+      << "  --h <int>            Input tensor extent H\n"
+      << "  --w <int>            Input tensor extent W\n"
+      << "  --c <int>            Input tensor extent C\n"
+      << "  --k <int>            Filter extent K\n"
+      << "  --r <int>            Filter extent R\n"
+      << "  --s <int>            Filter extent S\n\n"
+      << "  --alpha <float>      Epilogue scalar alpha\n"
+      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --ref-check          If set (true), reference check on the host is computed\n"
+      << "  --perf-check         If set (true), performance is measured.\n"
+      << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
+      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --save-workspace     If set, workspace is written to a text file.\n"
+      << "  --tag <string>       String to replicate across the first column in the results table\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop  --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
+      << "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop  --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";
+
+    return out;
+  }
+  
+  /// Computes the output tensor size (NPQK)
+  cutlass::Tensor4DCoord output_size() const {
+    return cutlass::Tensor4DCoord(
+      input_size.n(),
+      (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
+      (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
+      filter_size.n());
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+
+    // Number of multiply-adds = NPQK * CRS
+    int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c());
+    
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Result {
+  double runtime_ms;
+  double gflops;
+  cutlass::Status status;
+  cutlass::Status reference_check;
+  cudaError_t error;
+
+  Result(): 
+    runtime_ms(0), 
+    gflops(0),
+    status(cutlass::Status::kSuccess),
+    reference_check(cutlass::Status::kInvalid),
+    error(cudaSuccess) { }
+
+  static std::ostream & print_header(std::ostream &out, Options const &options) {
+
+    if (!options.tag.empty()) {
+      out << "Name,";
+    }
+
+    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
+
+    return out;
+  }
+
+  std::ostream & print(std::ostream &out, int idx, Options const &options) {
+
+    if (!options.tag.empty()) {
+      out << options.tag << ",";
+    }
+
+    out 
+      << "conv_" << idx << ","
+      << options.input_size.n() << ","
+      << options.input_size.h() << ","
+      << options.input_size.w() << ","
+      << options.input_size.c() << ","
+      << options.filter_size.n() << ","
+      << options.filter_size.h() << ","
+      << options.filter_size.w() << ","
+      << runtime_ms << ","
+      << gflops;
+
+    return out;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Runs one benchmark
+Result profile_convolution(Options const &options) {
+
+  Result result;
+
+  //
+  // Allocate host-device tensors using the CUTLASS Utilities.
+  //
+
+  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
+  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());
+
+  //
+  // Initialize tensors
+  //
+
+  // Fill tensor A on host with uniform-distribution random data
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_a.host_view(),
+      1,
+      ElementInputA(7),
+      ElementInputA(-8),
+      0);
+
+  // Fill tensor B on host with uniform-distribution random data
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_b.host_view(),
+      1,
+      ElementInputB(7),
+      ElementInputB(-8),
+      0);
+
+  // Fill tensor C on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_c.host_view());
+
+  // Fill tensor C for reference on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_ref_c.host_view());
+
+  // Copy data from host to GPU
+  tensor_a.sync_device();
+  tensor_b.sync_device();
+  tensor_c.sync_device();
+  tensor_ref_c.sync_device();
+
+  //
+  // Define arguments for CUTLASS Convolution
+  //
+
+  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+
+  // Split K dimension into 1 partitions
+  int split_k_slices = 1;
+
+  // Construct Conv2dProblemSize with user defined output size
+  cutlass::conv::Conv2dProblemSize problem_size(      
+      options.input_size,
+      options.filter_size,
+      options.padding,
+      options.conv_stride,
+      options.dilation,
+      options.output_size(),
+      mode,
+      split_k_slices
+  );
+
+  // Construct ImplicitGemm::Argument structure with conv2d 
+  // problem size, data pointers, and epilogue values
+  typename ImplicitGemm::Arguments arguments{
+    problem_size,
+    tensor_a.device_ref(),
+    tensor_b.device_ref(),
+    tensor_c.device_ref(),
+    tensor_c.device_ref(),
+    {options.alpha, options.beta},
+  };
+
+  //
+  // Initialize CUTLASS Convolution
+  //
+
+  ImplicitGemm implicit_gemm_op;
+
+  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  result.status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(result.status);
+
+  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
+  CUTLASS_CHECK(result.status);
+
+  //
+  // Launch initialized CUTLASS kernel
+  //
+  result.status = implicit_gemm_op();
+
+  CUTLASS_CHECK(result.status);
+
+  //
+  // Optional reference check
+  //
+  
+  if (options.reference_check) {
+    std::cout << "Verification on host...\n";
+
+    // Compute with reference implementation
+    cutlass::reference::host::Conv2dFprop<
+      ElementInputA,
+      LayoutInputA,
+      ElementInputB,
+      LayoutInputB,
+      ElementOutput,
+      LayoutOutput,
+      ElementComputeEpilogue,
+      ElementAccumulator,
+      cutlass::NumericConverter<ElementOutput, ElementComputeEpilogue>
+    >(
+      problem_size,
+      tensor_a.host_ref(),
+      tensor_b.host_ref(),
+      tensor_c.host_ref(),
+      tensor_ref_c.host_ref(),
+      options.alpha,
+      options.beta
+    );
+
+    // Check if output from CUTLASS kernel and reference kernel are equal or not
+    tensor_c.sync_host();
+
+    bool passed = cutlass::reference::host::TensorEquals(
+      tensor_c.host_view(),
+      tensor_ref_c.host_view());
+
+    if (!passed) {
+      result.reference_check = cutlass::Status::kErrorInternal;
+      std::cout << "ERROR - results miscompared.\n";
+    }
+    else {
+      result.reference_check = cutlass::Status::kSuccess;
+      std::cout << "Passed.\n";
+    }
+  }
+  else {
+    result.reference_check = cutlass::Status::kInvalid;
+  }
+
+  if (options.save_workspace) {
+
+    std::stringstream ss;
+
+    ss << "16_ampere_workspace_conv2dfprop_"
+      << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() 
+      << "_"
+      << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() 
+      << ".dat";
+
+    std::ofstream output_workspace(ss.str());
+
+    output_workspace 
+      << "Input = \n" << tensor_a.host_view() << "\n\n"
+      << "Filters = \n" << tensor_b.host_view() << "\n\n";
+
+    if (options.reference_check) {
+      output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
+    }
+
+    output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
+
+    std::cout << "Results written to '" << ss.str() << "'." << std::endl;
+  }
+  
+  //
+  // Performance measurement
+  //
+
+  if (options.measure_performance) {
+
+    cudaEvent_t events[2];
+    
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        return result;
+      }
+    }
+
+    // Record an event at the start of a series of convolution operations.
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Launch a sequence of implicit GEMM operations on the device
+    for (int iteration = 0; iteration < options.iterations; ++iteration) {
+      result.status = implicit_gemm_op();
+      CUTLASS_CHECK(result.status);
+    }
+
+    // Record an event when the convolutions have been launched.
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Print average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    // Cleanup
+    for (auto event : events) {
+      (void)cudaEventDestroy(event);
+    }
+  }
+
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  bool notSupported = false;
+
+  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
+  //
+  // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
+  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
+    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
+
+  if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
+    std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    return 0;
+  }
+
+  Options options;
+  
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.benchmark) {
+    // Benchmark several layers
+
+    int batch_sizes[] = {1, 32, 64, 128, 256, 512};
+
+    struct Benchmark {
+      int h, w, c, k, r, s;
+    } layers[] = {
+      {56,  56,   64,   256, 1, 1},
+      {56,  56,   64,    64, 1, 1},
+      {56,  56,   64,    64, 3, 3},
+      {56,  56,  256,    64, 1, 1},
+      {56,  56,  256,   512, 1, 1},
+      {56,  56,  256,   128, 1, 1},
+      {28,  28,  128,   128, 3, 3},
+      {28,  28,  128,   512, 1, 1},
+      {28,  28,  512,   128, 1, 1},
+      {28,  28,  512,  1024, 1, 1},
+      {28,  28,  512,   256, 1, 1},
+      {14,  14,  256,   256, 3, 3},
+      {14,  14,  256,  1024, 1, 1},
+      {14,  14,  1024,  256, 1, 1},
+      {14,  14,  1024, 2048, 1, 1},
+      {14,  14,  1024,  512, 1, 1},
+      {7,    7,   512,  512, 3, 3},
+    };
+
+    Result::print_header(std::cout, options) << std::endl;
+
+    int idx = 1;
+
+    for (auto const &layer : layers) {
+      for (auto N : batch_sizes) {
+
+        options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});
+
+        Result result = profile_convolution(options);
+        result.print(std::cout, idx, options) << std::endl;
+      }
+
+      ++idx;
+    }
+  }
+  else {
+
+    // Execute one problem size
+    if (!options.valid()) {
+      std::cerr << "Invalid problem." << std::endl;
+      return -1;
+    }
+
+    Result result = profile_convolution(options);
+
+    Result::print_header(std::cout, options) << std::endl;
+    result.print(std::cout, 1, options) << std::endl;
+  }
+
+  return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
--- a/examples/17_fprop_per_channel_bias/CMakeLists.txt
+++ b/examples/17_fprop_per_channel_bias/CMakeLists.txt
@ -0,0 +1,28 @@
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright notice, this list of
+#       conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright notice, this list of
+#       conditions and the following disclaimer in the documentation and/or other materials
+#       provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+#       to endorse or promote products derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+cutlass_example_add_executable(
+  17_fprop_per_channel_bias 
+  fprop_per_channel_bias.cu
+  )
+
--- a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
+++ b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
@ -0,0 +1,300 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+The convolution version of 12_gemm_bias_relu.  Similarly, we put bias vector in Operand C and the
+rest is the same as normal convolution.
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/host_reorder.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/device/convolution.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "helper.h"
+
+// The code section below describes datatype for input, output tensors and computation between
+// elements 
+using ElementAccumulator = float;                  // Data type of accumulator
+using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation
+using ElementInputA = cutlass::half_t;             // Data type of elements in input tensor
+using ElementInputB = cutlass::half_t;             // Data type of elements in input tensor
+using ElementOutput = float;                       // Data type of elements in output tensor
+
+using LayoutInputA = cutlass::layout::TensorNHWC;
+using LayoutInputB = cutlass::layout::TensorNHWC;
+using LayoutOutput = cutlass::layout::TensorNHWC;
+
+// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
+using MMAOp = cutlass::arch::OpClassTensorOp;
+
+// This code section describes CUDA SM architecture number
+using SmArch = cutlass::arch::Sm80;
+
+// This code section describes the tile size a thread block will compute
+using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;  // Threadblock tile shape
+
+// This code section describes tile size a warp will compute
+using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;         // Warp tile shape
+
+// This code section describes the size of MMA op
+using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;    // TensorCore instruction shape
+
+// This code section describes how threadblocks are scheduled on GPU
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
+
+// Number of pipelines you want to use
+constexpr int NumStages = 4;
+
+// This code section describe iterator algorithm selected is Analytic or Optimized
+static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;
+
+// This code section describes the epilogue part of the kernel, we use default value
+using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
+    ElementOutput,                                        // Data type of output matrix.
+    128 / cutlass::sizeof_bits<ElementOutput>::value,     // The number of elements per vectorized.
+                                                          // memory access. This becomes the vector width of
+                                                          // math instructions in the epilogue too.
+    ElementAccumulator,                                   // Data type of accumulator
+    ElementComputeEpilogue,                               // Data type for alpha in linear combination
+    cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // alpha X C + per channel bias
+
+
+using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+  ElementInputA, LayoutInputA,
+  ElementInputB, LayoutInputB,
+  ElementOutput, LayoutOutput,
+  ElementAccumulator,
+  MMAOp,
+  SmArch,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOp,
+  SwizzleThreadBlock,
+  NumStages,
+  cutlass::arch::OpMultiplyAdd,
+  IteratorAlgorithm
+>::Kernel;
+
+using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+int run() {
+
+  // Construct Conv2dProblemSize with user defined output size
+  cutlass::conv::Conv2dProblemSize problem_size(      
+    {1, 7, 7, 512},                               // activation 
+    {512, 3, 3, 512},                             // filter
+    {1, 1, 1, 1},                                 // padding
+    {1, 1},                                       // striding
+    {1, 1},                                       // dilation
+    cutlass::conv::Mode::kCrossCorrelation,       // mode (convolution or cross-correlation)
+    1                                             // split-k slices
+  );
+
+  // Initialize tensors using CUTLASS helper functions
+  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.activation_extent());
+  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.filter_extent());
+
+  // Create tensor C with dimensions 1x1x1xk which is the bias vector
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c_bias({1, 1, 1, problem_size.K});
+
+  // Create tensor D used to store output from CUTLASS kernel
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(problem_size.output_extent());
+  // Create matrix D with dimensions M x N used to store output from reference
+  // kernel
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(problem_size.output_extent());
+
+  // Fill input and output matrices on host using CUTLASS helper functions
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_a.host_view(),
+      1,
+      ElementInputA(4),
+      ElementInputA(-4),
+      0);  // <- Fill tensor A on host with uniform-distribution random data
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_b.host_view(),
+      1,
+      ElementInputB(4),
+      ElementInputB(-4),
+      0);  // <- Fill tensor B on host with uniform-distribution random data
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_c_bias.host_view(),
+      1,
+      ElementOutput(4),
+      ElementOutput(-4),
+      0);  // <- Fill matrix C on host with uniform-distribution random data
+  cutlass::reference::host::TensorFill(
+      tensor_d.host_view());  // <- fill matrix D on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_ref_d.host_view());  // <- fill matrix D for reference on host with zeros
+
+  // Copy data from host to GPU
+  tensor_a.sync_device();
+  tensor_b.sync_device();
+  tensor_c_bias.sync_device();
+  tensor_d.sync_device();
+  tensor_ref_d.sync_device();
+
+  // Initialize alpha for dot product computation
+  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
+
+  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
+  // instantiated CUTLASS kernel
+  typename ImplicitGemm::Arguments arguments{
+    problem_size,
+    tensor_a.device_ref(),              // <- reference to tensor A on device
+    tensor_b.device_ref(),              // <- reference to tensor B on device
+    // tensor C  is treated as the bias vector. We can enable the CONV
+    // to project away the N, H, W dimension by setting the stride to zero.
+    {tensor_c_bias.device_data(), LayoutOutput::Stride(0)},
+    tensor_d.device_ref(),              // <- reference to tensor D on device
+    {alpha} };                    
+
+  // Instantiate CUTLASS kernel depending on templates
+  ImplicitGemm implicit_gemm_op;
+
+  // Using the arguments, query for extra workspace required for matrix multiplication computation
+  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  // Check the problem size is supported or not
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
+  // Initialize CUTLASS kernel with arguments and workspace pointer
+  status = implicit_gemm_op.initialize(arguments, workspace.get());
+  CUTLASS_CHECK(status);
+
+  // Launch initialized CUTLASS kernel
+  status = implicit_gemm_op();
+
+  CUTLASS_CHECK(status);
+
+  //
+  // Create instantiation for device reference conv kernel
+  //
+
+  // Launch device reference to compute strictly the product A * B
+  cutlass::reference::device::Conv2d<
+      ElementInputA, 
+      LayoutInputA, 
+      ElementInputB, 
+      LayoutInputB, 
+      ElementOutput,
+      LayoutOutput, 
+      ElementComputeEpilogue, 
+      ElementAccumulator,
+      cutlass::NumericConverter<ElementOutput, ElementComputeEpilogue>>
+    (
+      cutlass::conv::Operator::kFprop, 
+      problem_size, 
+      tensor_a.device_ref(),
+      tensor_b.device_ref(), 
+      tensor_c_bias.device_ref(), 
+      tensor_ref_d.device_ref(),
+      alpha, 0
+    );
+
+  // Wait for kernels to finish
+  cudaDeviceSynchronize();
+
+  // Copy output data from CUTLASS and reference kernel to host for comparison
+  tensor_d.sync_host();
+  tensor_ref_d.sync_host();
+
+  // Compute bias + relu in host code
+  for (int n = 0; n < problem_size.N; ++n) {
+    for (int p = 0; p < problem_size.P; ++p) {
+      for (int q = 0; q < problem_size.Q; ++q) {
+        for (int k = 0; k < problem_size.K; ++k) {
+          
+          tensor_ref_d.at({n, p, q, k}) =
+              std::max(ElementOutput(0),
+                       ElementOutput(tensor_ref_d.at({n, p, q, k}) +
+                                     tensor_c_bias.at({0, 0, 0, k})));
+        }
+      }
+    }
+  }
+
+  // Check if output from CUTLASS kernel and reference kernel are equal or not
+  std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(),
+                                                       tensor_ref_d.host_view())
+                    ? "Passed"
+                    : "Failed")
+            << std::endl;
+
+  CUTLASS_CHECK(status);
+  return 0;
+}
+
+int main(int argc, char const **args) {
+
+  bool notSupported = false;
+
+  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
+  //
+  // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
+  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
+    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
+
+  if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
+    std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    return 0;
+  }
+
+  return run();
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
@ -22,15 +22,20 @@

 set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common)

+add_custom_target(cutlass_examples)
+add_custom_target(test_examples)
+
 function(cutlass_example_add_executable NAME)

  set(options)
  set(oneValueArgs)
-  set(multiValueArgs)
+  set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS})

+  add_dependencies(cutlass_examples ${NAME})
+
  target_link_libraries(
    ${NAME}
    PRIVATE
@ -44,19 +49,21 @@ function(cutlass_example_add_executable NAME)
    ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR}
    )

-  add_custom_target(
-    test_${NAME}
-    COMMAND
-    ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${NAME}>
-  DEPENDS
-    ${NAME}
+  install(
+    TARGETS ${NAME}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    )
+
+  cutlass_add_executable_tests(
+    test_examples_${NAME} ${NAME}
+    DEPENDS ${__DEPENDS}
+    DEPENDEES test_examples ${__DEPENDEES}
+    TEST_COMMAND_OPTIONS ${__TEST_COMMAND_OPTIONS}
+    DISABLE_EXECUTABLE_INSTALL_RULE
    )

 endfunction()

-add_custom_target(cutlass_examples)
-add_custom_target(test_examples)
-
 foreach(EXAMPLE
  00_basic_gemm
  01_cutlass_utilities
@ -67,16 +74,17 @@ foreach(EXAMPLE
  06_splitK_gemm
  07_volta_tensorop_gemm
  08_turing_tensorop_gemm
+  09_turing_tensorop_conv2dfprop
  10_planar_complex
  11_planar_complex_array
  12_gemm_bias_relu
-  13_fused_two_gemms
+  13_two_tensor_op_fusion  
  14_ampere_tf32_tensorop_gemm
  15_ampere_sparse_tensorop_gemm
-)
+  16_ampere_tensorop_conv2dfprop
+  17_fprop_per_channel_bias
+  )

  add_subdirectory(${EXAMPLE})
-  add_dependencies(cutlass_examples ${EXAMPLE})
-  add_dependencies(test_examples test_${EXAMPLE})

 endforeach()
--- a/include/cutlass/aligned_buffer.h
+++ b/include/cutlass/aligned_buffer.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/arch.h
+++ b/include/cutlass/arch/arch.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/cache_operation.h
+++ b/include/cutlass/arch/cache_operation.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/memory.h
+++ b/include/cutlass/arch/memory.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -187,10 +187,10 @@ struct global_load<AccessType,
 /////////////////////////////////////////////////////////////////////////////////////////////////

 template <
-    /// Fragment type to store loaded data
+    /// Fragment type to store data
    typename AccessType,
-    /// The bytes of loading
-    int LoadBytes
+    /// The bytes of storing
+    int StoreBytes
    >
 struct global_store;

@ -294,7 +294,6 @@ struct global_store<AccessType, 1> {

 /////////////////////////////////////////////////////////////////////////////////////////////////

-
 } // namespace arch
 } // namespace cutlass

--- a/include/cutlass/arch/memory_sm75.h
+++ b/include/cutlass/arch/memory_sm75.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -73,7 +73,7 @@ inline __device__ void ldsm(Array<unsigned, MatrixCount> & D, void const* ptr);
 #endif
 */

-#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
+#if (! defined (__clang__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
  extern "C" {
  //
  // This NVVM intrinsic is subject to change in future versions of CUDA.
@ -91,7 +91,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {

 // We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to
 // the previous internal intrinsics if they are available.
-#if (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11)
+#if (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11)
  //
  // This NVVM intrinsic converts an address in shared memory to a plain
  // unsigned integer. This is necessary to pass to shared memory instructions
@ -104,7 +104,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
  /// CUTLASS helper to get SMEM pointer
  return static_cast<unsigned>(__cvta_generic_to_shared(ptr));

-#elif (defined(__CUDA_ARCH__) &&  __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
+#elif (! defined (__clang__) && defined(__CUDA_ARCH__) &&  __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)

  return __nvvm_get_smem_pointer(ptr);

@ -120,7 +120,10 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {

 #else

-  return 0;
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();
+    return 0;
+
 #endif
 }
  
@ -146,7 +149,9 @@ inline __device__ void ldsm<layout::RowMajor, 1>(

  #else

-    assert(0);
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();

  #endif
 }
@ -168,7 +173,9 @@ inline __device__ void ldsm<layout::RowMajor, 2>(

  #else

-    assert(0);
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();

  #endif
 }
@ -190,7 +197,9 @@ inline __device__ void ldsm<layout::RowMajor, 4>(

  #else

-    assert(0);
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();

  #endif
 }
@ -216,7 +225,9 @@ inline __device__ void ldsm<layout::ColumnMajor, 1>(

  #else

-    assert(0);
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();

  #endif
 }
@ -238,7 +249,9 @@ inline __device__ void ldsm<layout::ColumnMajor, 2>(

  #else

-    assert(0);
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();

  #endif
 }
@ -260,7 +273,9 @@ inline __device__ void ldsm<layout::ColumnMajor, 4>(

  #else

-    assert(0);
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();

  #endif
 }
--- a/include/cutlass/arch/memory_sm80.h
+++ b/include/cutlass/arch/memory_sm80.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -74,11 +74,16 @@ template <
    /// Size of the access in bytes
    int SizeInBytes>
 struct cp_async<SizeInBytes, CacheOperation::Always> {
+
  /// Copy
  CUTLASS_DEVICE
  cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    #if CUDA_CP_ASYNC_ACTIVATED
-    
+ 
+      // Make sure the size is supported.
+      static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
+                "Size is not supported");
+   
      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);

      asm volatile(
@ -104,11 +109,16 @@ template <
    /// Size of the access in bytes
    int SizeInBytes>
 struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
+
  /// Copy with zero fill
  CUTLASS_DEVICE
  cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
    #if CUDA_CP_ASYNC_ACTIVATED
-    
+
+      // Make sure the size is supported.
+      static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
+                "Size is not supported");
+   
      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
      int src_in_bytes = (pred_guard ? SizeInBytes : 0);

@ -138,12 +148,13 @@ template <
    /// Size of the access in bytes
    int SizeInBytes>
 struct cp_async<SizeInBytes, CacheOperation::Global> {
+
  /// Copy
  CUTLASS_DEVICE
  cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    #if CUDA_CP_ASYNC_ACTIVATED
    
-      static_assert(SizeInBytes == 16, 
+      static_assert(SizeInBytes == 16,
        "cp.async only supports CacheOperation::Global when access size is 16B.");

      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
@ -171,12 +182,13 @@ template <
    /// Size of the access in bytes
    int SizeInBytes>
 struct cp_async_zfill<SizeInBytes, CacheOperation::Global> {
+
  /// Copy with zero fill
  CUTLASS_DEVICE
  cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    #if CUDA_CP_ASYNC_ACTIVATED

-      static_assert(SizeInBytes == 16, 
+      static_assert(SizeInBytes == 16,
        "cp.async only supports CacheOperation::Global when access size is 16B.");

      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
@ -235,4 +247,3 @@ CUTLASS_DEVICE void cp_async_wait<0>() {
 }  // namespace cutlass

 /////////////////////////////////////////////////////////////////////////////////////////////////
-
--- a/include/cutlass/arch/mma.h
+++ b/include/cutlass/arch/mma.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -201,5 +201,5 @@ struct SparseMma;
 #include "cutlass/arch/mma_sm70.h" 
 #include "cutlass/arch/mma_sm75.h" 
 #include "cutlass/arch/mma_sm80.h"
-#include "cutlass/arch/sp_mma_sm80.h"
+#include "cutlass/arch/mma_sparse_sm80.h"
 /////////////////////////////////////////////////////////////////////////////////////////////////
--- a/include/cutlass/arch/mma_sm50.h
+++ b/include/cutlass/arch/mma_sm50.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/mma_sm60.h
+++ b/include/cutlass/arch/mma_sm60.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/mma_sm61.h
+++ b/include/cutlass/arch/mma_sm61.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/mma_sm70.h
+++ b/include/cutlass/arch/mma_sm70.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/mma_sm75.h
+++ b/include/cutlass/arch/mma_sm75.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -365,7 +365,7 @@ struct Mma<
  }
 };

-/// Matrix multiply-add operation: S32 = S8 * U8 + S32
+/// Matrix multiply-add operation: S32 = U8 * U8 + S32
 template <>
 struct Mma<
  gemm::GemmShape<8, 8, 16>,
@ -599,7 +599,7 @@ struct Mma<
  }
 };

-/// Matrix multiply-add operation: S32 = S8 * U8 + S32
+/// Matrix multiply-add operation: S32 = U8 * U8 + S32
 template <>
 struct Mma<
  gemm::GemmShape<8,8,16>,
--- a/include/cutlass/arch/mma_sm80.h
+++ b/include/cutlass/arch/mma_sm80.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -112,7 +112,13 @@ struct Mma<
  );

 #else
-    assert(0);
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
 #endif
  }
 };
@ -178,7 +184,13 @@ struct Mma<
  );

 #else
-    assert(0);
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
 #endif
  }
 };
@ -230,7 +242,13 @@ struct Mma<gemm::GemmShape<16, 8, 8>, 32, tfloat32_t, layout::RowMajor,
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));

 #else
-    assert(0);
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
 #endif
  }
 };
@ -291,7 +309,13 @@ struct Mma<
  );

 #else
-    assert(0);
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
 #endif
  }
 };
@ -352,7 +376,13 @@ struct Mma<
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));

 #else
-    assert(0);
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
 #endif
  }
 };
@ -413,7 +443,13 @@ struct Mma<
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));

 #else
-    assert(0);
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
 #endif
  }
 };
@ -472,7 +508,13 @@ struct Mma<
      : "d"(A), "d"(B), "d"(C[0]), "d"(C[1]));

 #else
-    assert(0);
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+    
 #endif
  }
 };
--- a/include/cutlass/arch/mma_sparse_sm80.h
+++ b/include/cutlass/arch/mma_sparse_sm80.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -29,7 +29,15 @@

 #pragma once

-#include "mma_sm80.h"
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"

 /////////////////////////////////////////////////////////////////////////////////////////////////

--- a/include/cutlass/arch/simd.h
+++ b/include/cutlass/arch/simd.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/simd_sm60.h
+++ b/include/cutlass/arch/simd_sm60.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/simd_sm61.h
+++ b/include/cutlass/arch/simd_sm61.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/wmma.h
+++ b/include/cutlass/arch/wmma.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -52,7 +52,7 @@
 #endif
 #endif

-#endif //__clang__
+#endif //!defined(__clang__)

 #if defined(CUTLASS_ARCH_WMMA_ENABLED)

@ -82,6 +82,12 @@ struct CutlassToWmmaDataType<cutlass::half_t> {
  using Type = __half;
 };

+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
+template<>
+struct CutlassToWmmaDataType<cutlass::bfloat16_t> {
+  using Type = __nv_bfloat16;
+};
+#endif

 /// Statically maps int8_t => char
 template<>
@ -158,6 +164,14 @@ template<>
 struct WmmaToCutlassDataType<__half> {
  using Type = cutlass::half_t;
 };
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
+template<>
+struct WmmaToCutlassDataType<__nv_bfloat16> {
+  using Type = cutlass::bfloat16_t;
+};
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////

 /////////////////////////////////////////////////////////////////////////////////////////////////
--- a/include/cutlass/arch/wmma_sm70.h
+++ b/include/cutlass/arch/wmma_sm70.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/wmma_sm72.h
+++ b/include/cutlass/arch/wmma_sm72.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/arch/wmma_sm75.h
+++ b/include/cutlass/arch/wmma_sm75.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/array.h
+++ b/include/cutlass/array.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/array_planar_complex.h
+++ b/include/cutlass/array_planar_complex.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
--- a/include/cutlass/array_subbyte.h
+++ b/include/cutlass/array_subbyte.h
@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@ -45,9 +45,6 @@ template <
 class Array<T, N, false> {
 public:

-  static_assert(sizeof_bits<T>::value * N >= 8,
-    "Array<> specialized for sub-byte types assume the actual stored element size is 1 byte");
-
  static int const kSizeBits = sizeof_bits<T>::value * N;

  /// Storage type
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Haicheng Wu	0f1056390d	Create PUBLICATIONS.md (#189 )	2021-03-03 11:17:40 -08:00
Haicheng Wu	34a42e5620	Update generator.py (#192 )	2021-03-02 12:21:48 -08:00
Dustyn Blasig	8f09b82b12	Merge pull request #187 from NVIDIA/cutlass_2.5 CUTLASS 2.5.0	2021-02-26 23:56:04 -06:00
Andrew Kerr	200a5a5146	Enabled reduction unit tests.	2021-02-26 15:46:57 -05:00
Andrew Kerr	746b7b3247	Enabled tensor reduction kernels.	2021-02-26 15:32:19 -05:00
Andrew Kerr	abdf16a4d9	Updated release notes.	2021-02-26 13:55:04 -05:00
Andrew Kerr	0e13748649	CUTLASS 2.5	2021-02-26 09:58:26 -05:00
Manish Gupta	ccb697bac7	cutlass 2.4 documentation only update	2020-11-23 06:59:45 -06:00
Yang Wang	e6bcdc60cf	fix broken links (#148 )	2020-11-19 21:46:54 -08:00
Manish Gupta	6615010cd0	CUTLASS 2.4 (Implicit GEMM convolution) (#147 ) CUTLASS 2.4 (Implicit GEMM Convolution) Co-authored-by: Manish Gupta <manigupta@nvidia.com>, Haicheng Wu <haichengw@nvidia.com>, Dustyn Blasig <dblasig@nvidia.com>, Andrew Kerr <akerr@nvidia.com>	2020-11-19 21:25:25 -08:00