Updates for 3.1 (#932)

This commit is contained in:
ANIKET SHIVAM
2023-04-29 06:34:27 -07:00
committed by GitHub
parent 6f8596ce3f
commit 7c04f95415
51 changed files with 1796 additions and 328 deletions

View File

@ -59,6 +59,8 @@ endif()
find_package(Doxygen QUIET)
################################################################################
#
# CUTLASS 3.x requires C++17
#
@ -80,6 +82,10 @@ endif()
message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive
################################################################################
set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
if(CUTLASS_ENABLE_HEADERS_ONLY)
@ -112,6 +118,8 @@ if (CUTLASS_ENABLE_TESTS)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
endif()
################################################################################
set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87)
@ -197,15 +205,16 @@ set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")
set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.")
################################################################################
#
# CUTLASS generator cmake configuration
#
set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.")
set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_KERNELS ${CUTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")
# Test Levels L0, L1, L2
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
################################################################################
set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests")
@ -225,6 +234,8 @@ if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1)
endif()
################################################################################
#
# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
#
@ -650,14 +661,16 @@ function(cutlass_add_executable_tests NAME TARGET)
# DEPENDS: A list of targets or files on which this test is dependent.
# DEPENDEES: A list of targets which should depend on this test.
# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments
# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of
# to pass to the test executable. A unique test is generated for each set of
# options given. If this option is not used, a single test with no arguments is generated.
# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for
# generating the full variable name to be referenced.
# RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed
# test results to speed up test runtime.
#
set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@ -701,7 +714,6 @@ function(cutlass_add_executable_tests NAME TARGET)
endif()
list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT)
set(CMD_IDX 0)
if (CMD_COUNT GREATER 1)
add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS})
@ -710,12 +722,12 @@ function(cutlass_add_executable_tests NAME TARGET)
endforeach()
endif()
foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS})
foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS)
if (CMD_COUNT GREATER 1)
set(TEST_NAME ${NAME}_${CMD_IDX})
string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TEST_NAME)
else()
set(TEST_NAME ${NAME})
string(TOLOWER "${NAME}" TEST_NAME)
endif()
# The following rigmarole is needed to deal with spaces and possible quotes in
@ -724,14 +736,14 @@ function(cutlass_add_executable_tests NAME TARGET)
# preserves any quotes. Note, they have to be in this order for it to work for
# all the use cases below.
set(CMD_OPTIONS ${${CMD_OPTIONS}})
list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(CMD_OPTIONS)
set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}})
list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(TEST_COMMAND_OPTIONS)
add_custom_target(
${TEST_NAME}
COMMAND
${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
DEPENDS
${TARGET}
)
@ -746,7 +758,7 @@ function(cutlass_add_executable_tests NAME TARGET)
add_test(
NAME c${TEST_NAME}
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
)
set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS})
@ -756,18 +768,21 @@ function(cutlass_add_executable_tests NAME TARGET)
# To run the tests from an install package with tests enabled, we need to generate test files
# that don't rely on the current directory structure in build.
set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/${NAME})
file(MAKE_DIRECTORY ${TEST_GEN_DIR})
set(TEST_NAME c${TEST_NAME})
set(TEST_EXE $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR})
configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)
configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)
file(GENERATE
OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake"
INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
)
install(
FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake"
DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/
)
@ -775,8 +790,6 @@ function(cutlass_add_executable_tests NAME TARGET)
endif()
math(EXPR CMD_IDX "${CMD_IDX} + 1")
endforeach()
endfunction()
@ -787,6 +800,7 @@ if (CUTLASS_ENABLE_TOOLS)
add_dependencies(test_all test_profiler)
endif()
endif()
if (CUTLASS_ENABLE_EXAMPLES)
add_subdirectory(examples)
add_dependencies(test_all test_examples)

View File

@ -31,5 +31,6 @@
cutlass_example_add_executable(
08_turing_tensorop_gemm
turing_tensorop_gemm.cu
DISABLE_TESTS ON
)

View File

@ -31,5 +31,6 @@
cutlass_example_add_executable(
12_gemm_bias_relu
gemm_bias_relu.cu
DISABLE_TESTS ON
)

View File

@ -34,7 +34,7 @@
matrix multiply kernel to verify its correctness.
The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. This is kernel computes
the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes
the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.

View File

@ -34,7 +34,7 @@
matrix multiply kernel to verify its correctness.
The CUTLASS Trmm template is instantiated in the function CutlassStrmmNN. This is kernel computes
the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes
the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.

View File

@ -495,7 +495,7 @@ int main(int argc, const char **argv)
options.tensor_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from CUTLASS kernel
options.tensor_ref_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from reference kernel
// Fill matrix A on host with uniform-random data [4, -4]
// Fill matrix A on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_a.host_view(),
1,
@ -503,7 +503,7 @@ int main(int argc, const char **argv)
ElementA(-2),
0);
// Fill matrix B on host with uniform-random data [4, -4]
// Fill matrix B on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_b.host_view(),
1,
@ -511,7 +511,7 @@ int main(int argc, const char **argv)
ElementB(-2),
0);
// Fill matrix C on host with uniform-random data [4, -4]
// Fill matrix C on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_c.host_view(),
1,

View File

@ -84,9 +84,10 @@
therefore letting the builder pick the collective specialization.
CUTLASS builders make an attempt to pick the best schedule when `Auto` is provided such that the
assembled collctives have the best performance, but this is not a guarantee. A user relying on `Auto`
assembled collectives have the best performance, but this is not a guarantee. A user relying on `Auto`
may get a free performance upgrade with newer CUTLASS releases in case we can provide more optimized
implementations that the builder can transparently assemble for `Auto`.
implementations that the builder can transparently assemble for `Auto`. But a user should not rely on
`Auto` if they require a specific scheduling policy and/or stage count to be used.
If a user decides to let the builders pick the collective specialization via `Auto` schedules,
they must be used for both mainloop and epilogue alike to ensure compatibility between the
@ -99,11 +100,6 @@
in this manner remains the primary API for using CUTLASS 3 kernels. `CollectiveBuilder`s are
simply meant to be a convenience interface.
Note also that, while the selections made by CollectiveBuilder attempt to maximize performance, this is not
a guarantee. Furthermore, the behavior of the CollectiveBuilder when `Auto` parameters are provided is subject
to change in future CUTLASS releases -- do not rely on `Auto` if you require a specific scheduling policy and/or
stage count to be used.
Details of this example
-----------------------
This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified.

View File

@ -227,4 +227,17 @@ elect_one_leader_sync()
#endif
}
// Store value to remote shared memory in the cluster
CUTE_DEVICE
void
store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank)
{
#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank);
uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank);
asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];"
: : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr));
#endif
}
} // end namespace cute

View File

@ -35,6 +35,7 @@
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/arch/cache_operation.h"
namespace cutlass {
namespace arch {
@ -45,7 +46,9 @@ template <
/// Fragment type to store loaded data
typename AccessType,
/// The bytes of loading
int LoadBytes
int LoadBytes,
/// Cache operation
CacheOperation::Kind cache_op = CacheOperation::Always
>
struct global_load;
@ -59,7 +62,7 @@ struct global_load;
#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \
(__CUDACC_VER_MAJOR__ > 11)) && \
defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
#define CUTLASS_ENABLE_L2_PREFETCH 1
#else
#define CUTLASS_ENABLE_L2_PREFETCH 0
@ -71,7 +74,8 @@ struct global_load;
// keep the initializing code before ld.global
template <typename AccessType>
struct global_load<AccessType,
32
32,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -107,7 +111,40 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
16
32,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint4 *data = reinterpret_cast<uint4 *>(&D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %9, 0;\n"
" mov.b32 %0, %10;\n"
" mov.b32 %1, %11;\n"
" mov.b32 %2, %12;\n"
" mov.b32 %3, %13;\n"
" mov.b32 %4, %14;\n"
" mov.b32 %5, %15;\n"
" mov.b32 %6, %16;\n"
" mov.b32 %7, %17;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%8];\n"
" @p ld.global.lu.v4.u32 {%4, %5, %6, %7}, [%18];\n"
"}\n"
: "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
"=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
: "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
"r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
"r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
}
};
template <typename AccessType>
struct global_load<AccessType,
16,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -133,7 +170,31 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
8
16,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" mov.b32 %0, %6;\n"
" mov.b32 %1, %7;\n"
" mov.b32 %2, %8;\n"
" mov.b32 %3, %9;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
}
};
template <typename AccessType>
struct global_load<AccessType,
8,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -158,7 +219,30 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
4
8,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint2 &data = reinterpret_cast<uint2 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %3, 0;\n"
" mov.b32 %0, %4;\n"
" mov.b32 %1, %5;\n"
" @p ld.global.lu.v2.u32 {%0, %1}, [%2];\n"
"}\n"
: "=r"(data.x), "=r"(data.y)
: "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y));
}
};
template <typename AccessType>
struct global_load<AccessType,
4,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -182,7 +266,29 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
2
4,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
unsigned &data = reinterpret_cast<unsigned &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" mov.b32 %0, %3;\n"
" @p ld.global.lu.u32 %0, [%1];\n"
"}\n"
: "=r"(data)
: "l"(ptr), "r"((int)pred_guard), "r"(data));
}
};
template <typename AccessType>
struct global_load<AccessType,
2,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -206,7 +312,29 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
1
2,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint16_t &data = reinterpret_cast<uint16_t &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" mov.b16 %0, %3;\n"
" @p ld.global.lu.u16 %0, [%1];\n"
"}\n"
: "=h"(data)
: "l"(ptr), "r"((int)pred_guard), "h"(data));
}
};
template <typename AccessType>
struct global_load<AccessType,
1,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {

View File

@ -81,7 +81,6 @@ protected:
CUTLASS_DEVICE
static void red_release(int *ptr, int val)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
#if (__CUDA_ARCH__ >= 700)
/// SM70 and newer use memory consistency qualifiers
@ -94,7 +93,6 @@ protected:
__threadfence();
atomicAdd(ptr, val);
#endif // (__CUDA_ARCH__ >= 700)
#endif
}
@ -104,7 +102,6 @@ public:
CUTLASS_DEVICE
static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
if (thread_idx == 0)
@ -115,14 +112,12 @@ public:
}
__syncthreads();
#endif
}
/// Uses thread[0] to wait for at least the specified count of signals on the given flag counter
CUTLASS_DEVICE
static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
if (thread_idx == 0)
@ -132,13 +127,11 @@ public:
while(ld_acquire(flag_ptr) != val) {}
}
__syncthreads();
#endif
}
/// Uses thread[0] to wait for the specified count of signals on the given flag counter
CUTLASS_DEVICE
static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
if (thread_idx == 0)
@ -149,14 +142,12 @@ public:
}
__syncthreads();
#endif
}
/// Increment the arrival count for a flag
CUTLASS_DEVICE
static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
__syncthreads();
@ -165,7 +156,6 @@ public:
{
red_release(flag_ptr, 1);
}
#endif
}
@ -173,7 +163,6 @@ public:
CUTLASS_DEVICE
static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
int flag_idx = first_flag_idx + thread_idx;
T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
@ -184,7 +173,6 @@ public:
if (thread_idx < count) {
red_release(flag_ptr, 1);
}
#endif
}
};

View File

@ -59,9 +59,7 @@ inline std::ostream &operator<<(std::ostream &out, dim3 d) {
/// Output operator for CUDA built-in error type
inline std::ostream &operator<<(std::ostream &out, cudaError_t error) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
return out << cudaGetErrorString(error);
#endif
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -290,7 +290,7 @@ struct CollectiveBuilder<
AlignmentD,
Schedule,
cute::enable_if_t<cute::is_same_v<Schedule, TmaWarpSpecialized> ||
cute::is_same_v<Schedule, TmaWarpSpecializedCooperative> >> {
cute::is_same_v<Schedule, TmaWarpSpecializedCooperative> >> {
public:
// Passing void C disables source load
using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,
@ -302,16 +302,33 @@ public:
using ThreadOp = thread::LinearCombination<
ElementD, AlignmentD, ElementAccumulator, ElementCompute,
thread::ScaleType::Default, FloatRoundStyle::round_to_nearest, ElementC>;
ScaleType, FloatRoundStyle::round_to_nearest, ElementC>;
private:
using Impl = detail::TmaBuilderImpl<
TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute,
ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD,
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>;
using GmemStrideTypeC = gemm::TagToStrideC_t<GmemLayoutTagC>;
using GmemStrideTypeD = gemm::TagToStrideC_t<GmemLayoutTagD>;
public:
using CollectiveOp = typename Impl::CollectiveOp;
using EpilogueTile_MN = decltype(detail::sm90_compute_tile_shape_or_override<
ElementD, EpilogueTileType, Schedule>());
static constexpr int StagesC = 1;
static constexpr int StagesD = 2;
static constexpr bool DisableReuseSmemC = true;
using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue<
cutlass::epilogue::Sm90TmaWarpSpecialized<StagesC,StagesD,DisableReuseSmemC>,
TileShape_MNK,
EpilogueTile_MN,
ElementC_, // need to pass void to expose via GemmUniversal
GmemStrideTypeC,
ElementD,
GmemStrideTypeD,
ThreadOp,
SM90_TMA_LOAD,
decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeC, ElementC, TileShape_MNK>()),
decltype(detail::sm90_get_smem_load_op_for_source<GmemStrideTypeC, ElementC>()),
SM90_TMA_STORE,
decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeD, ElementD, EpilogueTile_MN>()),
decltype(detail::sm90_get_smem_store_op_for_accumulator<GmemStrideTypeD, ElementD>())
>;
};
// Auto builder
@ -409,7 +426,7 @@ struct CollectiveBuilder<
AlignmentD,
Schedule,
cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedElementwiseBase, Schedule> ||
cute::is_base_of_v<TmaWarpSpecializedCooperativeElementwiseBase, Schedule> >> {
cute::is_base_of_v<TmaWarpSpecializedCooperativeElementwiseBase, Schedule> >> {
public:
using ThreadOp = thread::LinearCombinationGeneric<
@ -419,10 +436,13 @@ public:
Schedule::Round>;
private:
static constexpr int StagesC = 1;
static constexpr int StagesD = 2;
static constexpr bool DisableReuseSmemC = true;
using Impl = detail::TmaBuilderImpl<
TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute,
ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD,
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>;
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<StagesC,StagesD,DisableReuseSmemC>>;
public:
using CollectiveOp = typename Impl::CollectiveOp;
@ -459,7 +479,7 @@ struct CollectiveBuilder<
AlignmentD,
Schedule,
cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedBiasElementwiseBase, Schedule> ||
cute::is_base_of_v<TmaWarpSpecializedCooperativeBiasElementwiseBase, Schedule> >> {
cute::is_base_of_v<TmaWarpSpecializedCooperativeBiasElementwiseBase, Schedule> >> {
public:
using ThreadOp = thread::LinearCombinationBiasElementwise<
@ -468,10 +488,12 @@ public:
Schedule::StoreT, typename Schedule::ElementBias>;
private:
static constexpr int StagesC = 1;
static constexpr int StagesD = 2;
using Impl = detail::TmaBuilderImpl<
TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute,
ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD,
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise<1,2>>;
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise<StagesC,StagesD>>;
public:
using CollectiveOp = typename Impl::CollectiveOp;

View File

@ -82,7 +82,6 @@ public:
//
// Type Aliases
//
// derived types of output thread level operator
using DispatchPolicy = Sm90TmaWarpSpecialized<StagesC_,StagesD_,DisableSmemReuseC_>;
using BlockTileShape = BlockTileShape_;
using EpilogueTile = EpilogueTile_;
@ -108,7 +107,6 @@ public:
constexpr static bool iskThreadEpilogueOpWithBias = detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::value;
using AlignmentType = typename uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
static_assert(sizeof(ElementC) == 2, "Only 16b source supported for now");
static_assert(sizeof(ElementD) == 2, "Only 16b output supported for now");
static_assert(!is_layout<EpilogueTile>::value && is_tuple<EpilogueTile>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
static_assert(rank(BlockTileShape{}) == 3, "BlockTileShape must be rank-3: [BLK_M,BLK_N,BLK_K]");
@ -117,17 +115,19 @@ public:
static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
private:
using InternalElementC = std::conditional_t<std::is_void_v<ElementC>,ElementD,ElementC>; // prevents void ref breakages
constexpr static int StagesC = StagesC_;
constexpr static int StagesD = StagesD_;
constexpr static bool is_source_supported = ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default ||
ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::NoBetaScaling;
static_assert((std::is_void_v<ElementC> && not is_source_supported) || (not std::is_void_v<ElementC> && is_source_supported));
// internal optimization to reuse C shared memory for storing D
using SmemLayoutAtomBitsC = decltype(downcast<sizeof_bits<ElementC>::value>(SmemLayoutAtomC{}));
using SmemLayoutAtomBitsC = decltype(downcast<sizeof_bits<InternalElementC>::value>(SmemLayoutAtomC{}));
using SmemLayoutAtomBitsD = decltype(downcast<sizeof_bits<ElementD>::value>(SmemLayoutAtomD{}));
constexpr static bool ReuseSmemC = not DispatchPolicy::DisableSmemReuseC &&
is_source_supported &&
sizeof(ElementC) == sizeof(ElementD) &&
sizeof(InternalElementC) == sizeof(ElementD) &&
StrideC{} == StrideD{} &&
cute::is_same_v<SmemLayoutAtomBitsC,SmemLayoutAtomBitsD>;
@ -152,7 +152,7 @@ public:
using LoadPipeline = cutlass::PipelineTransactionAsync<is_source_supported ? StagesC : 0>;
using LoadPipelineState = cutlass::PipelineState<is_source_supported ? StagesC : 0>;
constexpr static uint32_t TmaTransactionBytes =
size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof(ElementC));
size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof(InternalElementC));
// TMA pipeline for storing D
using StorePipeline = cutlass::PipelineTmaStore<ReuseSmemC ? StagesC : StagesD>;
@ -161,8 +161,8 @@ public:
struct SharedStorage {
struct TensorStorage : aligned_struct<128> {
cute::conditional_t<not is_source_supported,
detail::EmptyStorage<ElementC>,
array_aligned<ElementC, size(SmemLayoutC{})>> smem_C;
detail::EmptyStorage<InternalElementC>,
array_aligned<InternalElementC, size(SmemLayoutC{})>> smem_C;
alignas(128) cute::conditional_t<ReuseSmemC,
detail::EmptyStorage<ElementD>,
array_aligned<ElementD, size(SmemLayoutD{})>> smem_D;
@ -187,7 +187,7 @@ public:
struct Params {
using TMA_C = decltype(make_tma_copy(
CopyOpG2S{},
make_tensor(static_cast<ElementC const*>(nullptr),
make_tensor(static_cast<InternalElementC const*>(nullptr),
repeat_like(StrideC{}, int32_t(0)), StrideC{}),
SmemLayoutC{}(_,_,0)));
using TMA_D = decltype(make_tma_copy(
@ -217,7 +217,7 @@ public:
auto M = get<0>(problem_shape_MNKL);
auto N = get<1>(problem_shape_MNKL);
auto L = get<3>(problem_shape_MNKL);
Tensor tensor_c = make_tensor(args.ptr_C, make_layout(make_shape(M,N,L), args.dC));
Tensor tensor_c = make_tensor(static_cast<InternalElementC const*>(args.ptr_C), make_layout(make_shape(M,N,L), args.dC));
Tensor tensor_d = make_tensor(args.ptr_D, make_layout(make_shape(M,N,L), args.dD));
typename Params::TMA_C tma_load_c = make_tma_copy(
CopyOpG2S{},
@ -409,7 +409,7 @@ public:
// Allocate register tensors
auto tRS_rD_shape = take<0,3>(shape(thread_r2s.partition_S(bEsD))); // (R2S,R2S_M,R2S_N)
Tensor tRS_rC = make_tensor<ElementC>(tRS_rD_shape); // (R2S,R2S_M,R2S_N)
Tensor tRS_rC = make_tensor<InternalElementC>(tRS_rD_shape); // (R2S,R2S_M,R2S_N)
Tensor tRS_rD = make_tensor<ElementD>(tRS_rD_shape); // (R2S,R2S_M,R2S_N)
// Vectorized fragment view for thread epilogue op
@ -418,7 +418,7 @@ public:
Tensor tRS_rD_frg = recast<typename ThreadEpilogueOp::FragmentOutput>(tRS_rD);
// Partition for smem to register copy (tSR_)
TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R,ElementC>{}, tiled_r2s);
TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R,InternalElementC>{}, tiled_r2s);
ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx);
Tensor tSR_sC = thread_s2r.partition_S(bEsC); // (S2R,S2R_M,S2R_N,EPI_M,EPI_N)
Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N)

View File

@ -130,6 +130,7 @@ public:
using ActivationFunctor = ActivationFunctor_<ElementCompute>;
static constexpr int kCount = 1;
static constexpr ScaleType::Kind kScale = Scale;
using FragmentOutput = Array<ElementOutput, kCount>;
using FragmentAccumulator = Array<ElementAccumulator, kCount>;

View File

@ -323,7 +323,7 @@ public:
OutputTileIterator destination_iterator, ///< Tile iterator for destination
OutputTileIterator source_iterator) ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
{
// Redcuce peer accumulator fragments into one fragment
// Reduce peer accumulator fragments into one fragment
AccumulatorFragment accum_fragment;
BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);

View File

@ -190,7 +190,8 @@ struct CollectiveMma<
"SmemLayoutB K must be 128bytes to be transposed.");
static_assert(!transform::collective::detail::use_universal_transposition<InternalSmemLayoutAtomB, InternalElementB>(),
"Warp specialized ARF kernels have not supported universal B transposition yet.");
static_assert(!TransposeB || shape<0>(TileShape{}) == 64, "Optimized transpose RS kernel requires TileShape M = 64.");
static_assert(!TransposeB || !cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
"Transpose RS kernel requires kernel schedule schmem is not KernelTmaWarpSpecializedCooperative.");
struct SharedStorage
{
@ -294,7 +295,7 @@ struct CollectiveMma<
static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
static_assert(K_PIPE_MMAS >= 1, "At least one MMA stage should be asynchronous for this mainloop.");
static_assert(K_PIPE_MMAS == 0, "no MMA stage should be asynchronous for this mainloop for now.");
static constexpr uint32_t TmaTransactionBytes =
(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof(InternalElementA)))+
(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof(InternalElementB)));
@ -368,21 +369,6 @@ struct CollectiveMma<
}
}
// Issue the prologue loads
int k_tile_prologue = min(k_tile_count, K_PIPE_MAX);
CUTLASS_PRAGMA_UNROLL
for (int count = 0; count < k_tile_prologue; ++count) {
pipeline.producer_acquire(smem_pipe_write);
using BarrierType = typename MainloopPipeline::ProducerBarrierType;
BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
int write_stage = smem_pipe_write.index();
copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
++k_tile_iter;
++smem_pipe_write;
}
k_tile_count -= k_tile_prologue;
// Mainloop
CUTLASS_PRAGMA_NO_UNROLL
for ( ; k_tile_count > 0; --k_tile_count) {

View File

@ -303,22 +303,6 @@ struct CollectiveMma<
}
}
// Issue the prologue loads
int k_tile_prologue = min(k_tile_count, K_PIPE_MAX);
CUTLASS_PRAGMA_UNROLL
for (int count = 0; count < k_tile_prologue; ++count) {
pipeline.producer_acquire(smem_pipe_write);
using BarrierType = typename MainloopPipeline::ProducerBarrierType;
BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
int write_stage = smem_pipe_write.index();
copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
++k_tile_iter;
++smem_pipe_write;
}
k_tile_count -= k_tile_prologue;
// Mainloop
CUTLASS_PRAGMA_NO_UNROLL
for ( ; k_tile_count > 0; --k_tile_count)

View File

@ -301,18 +301,19 @@ public:
return 0;
}
result = cudaGetDeviceProperties(&properties, device_idx);
int multiprocessor_count;
result = cudaDeviceGetAttribute(&multiprocessor_count,
cudaDevAttrMultiProcessorCount, device_idx);
if (result != cudaSuccess) {
// Call cudaGetLastError() to clear the error bit
result = cudaGetLastError();
CUTLASS_TRACE_HOST(" cudaGetDeviceProperties() returned error "
<< cudaGetErrorString(result));
CUTLASS_TRACE_HOST(
" cudaDeviceGetAttribute() returned error "
<< cudaGetErrorString(result));
return 0;
}
bool override_sm_count = (available_sm_count < 0 || available_sm_count > properties.multiProcessorCount);
bool override_sm_count = (available_sm_count < 0 || available_sm_count > multiprocessor_count);
if (override_sm_count) {
available_sm_count = properties.multiProcessorCount;
available_sm_count = multiprocessor_count;
}
int max_active_blocks = maximum_active_blocks();
@ -440,8 +441,6 @@ public:
cudaError_t result = cudaGetLastError();
if (result != cudaSuccess) {
// Call cudaGetLastError() to clear the error bit
result = cudaGetLastError();
CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result));
return Status::kErrorInternal;
}

View File

@ -490,9 +490,9 @@ struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, double,
static int const kAlignmentA = 1;
static int const kAlignmentB = 1;
using ThreadblockShape = GemmShape<128, 256, 64>;
using WarpShape = GemmShape<64, 64, 64>;
using InstructionShape = GemmShape<16, 8, 16>;
using ThreadblockShape = GemmShape<128, 128, 16>;
using WarpShape = GemmShape<32, 64, 16>;
using InstructionShape = GemmShape<8, 8, 4>;
static int const kStages = 3;
using EpilogueOutputOp = epilogue::thread::LinearCombination<

View File

@ -31,7 +31,7 @@
/*! \file
\brief Template for a GEMM kernel that can broadcast bias vector in the
epigloue.
epilogue.
*/
#pragma once

View File

@ -0,0 +1,167 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/arch.h"
#include "cutlass/device_kernel.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/gemm/kernel/gemm_universal.h"
#include "cutlass/gemm/kernel/default_gemm_universal.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/gemm/device/gemm_universal_base.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace device {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename GemvKernel_>
class GemvStridedBatched {
public:

  using GemvKernel = GemvKernel_;

  using ElementA = typename GemvKernel::ElementA;
  using LayoutA = typename GemvKernel::LayoutA;
  using ElementB = typename GemvKernel::ElementB;
  using ElementC = typename GemvKernel::ElementC;

  using ElementAccumulator = typename GemvKernel::ElementAccumulator;
  using EpilogueOutputOp = typename GemvKernel::EpilogueOutputOp;

  static ComplexTransform const kTransformA = GemvKernel::kTransformA;
  static ComplexTransform const kTransformB = GemvKernel::kTransformB;

  // Thread-block shape mirrored from the kernel: x covers K, y covers M.
  static int const kThreadCount = GemvKernel::kThreadCount;
  static int const mThreadCount = GemvKernel::mThreadCount;

  static int const kStages = GemvKernel::kStages;

  static int const kAlignmentA = GemvKernel::kAlignmentA;
  static int const kAlignmentB = GemvKernel::kAlignmentB;
  static int const kAlignmentC = GemvKernel::kAlignmentC;

  using Arguments = typename GemvKernel::Arguments;
  using Params = typename GemvKernel::Params;

private:

  /// Host-side copy of the kernel parameters passed at launch
  Params params_;

public:

  /// Constructs the Gemv.
  GemvStridedBatched() {}

  /// Determines whether the Gemv can execute the given problem.
  static Status can_implement(Arguments const& args) {
    return GemvKernel::can_implement(args);
  }

  /// Gets the workspace size (this kernel requires no device workspace)
  static size_t get_workspace_size(Arguments const& args) { return 0; }

  /// Initializes Gemv state from arguments.
  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
    params_ = Params(args);
    // K must be a multiple of the vectorized access width used by the kernel.
    if (args.problem_size.column() % GemvKernel::kElementsPerAccess) {
      return Status::kErrorMisalignedOperand;
    }
    return Status::kSuccess;
  }

  /// Lightweight update given a subset of arguments
  Status update(Arguments const &args, void *workspace = nullptr) {
    return params_.update(args);
  }

  /// Runs the kernel using initialized state.
  Status run(cudaStream_t stream = nullptr) {

    // gridDim.z is hardware-limited to 65535, and the kernel strides over the
    // batch dimension by gridDim.z. Clamp rather than wrap: the previous
    // `batch_count % 65536` launched a zero-sized grid whenever batch_count
    // was an exact multiple of 65536.
    int grid_z = params_.batch_count < 65535 ? params_.batch_count : 65535;
    dim3 grid(1, 1, grid_z);
    dim3 block(kThreadCount, mThreadCount, 1);

    int smem_size = 0;

    // Launch
    cutlass::Kernel<GemvKernel><<<grid, block, smem_size, stream>>>(params_);

    //
    // Query for errors
    //
    cudaError_t result = cudaGetLastError();

    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
  }

  /// Runs the kernel using initialized state.
  Status operator()(cudaStream_t stream = nullptr) { return run(stream); }

  /// Initializes and runs in one call.
  Status operator()(
      Arguments const &args,
      void *workspace = nullptr,
      cudaStream_t stream = nullptr) {

    Status status = initialize(args, workspace, stream);

    if (status == Status::kSuccess) {
      status = run(stream);
    }

    return status;
  }
};
////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace gemm
} // namespace cutlass
////////////////////////////////////////////////////////////////////////////////

View File

@ -150,7 +150,7 @@ template<
int Stages_,
class ClusterShape_ = Shape<_1,_1,_1>,
class KernelSchedule = KernelTmaWarpSpecialized,
int PipelineAsyncMmaStages_ = 1
int PipelineAsyncMmaStages_ = 0
>
struct MainloopSm90TmaGmmaRmemAWarpSpecialized {
constexpr static int Stages = Stages_;

View File

@ -0,0 +1,368 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/fast_math.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/complex.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/cache_operation.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_conversion.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
    typename ElementA_,           /// matrix
    typename LayoutA_,
    typename ElementB_,           /// vector
    typename ElementC_,
    typename ElementAccumulator_,
    int kElementsPerAccess_,
    typename EpilogueOutputOp_
>
struct GemvStridedBatched {
public:

  using ElementA = ElementA_;
  using LayoutA = layout::RowMajor;
  using TensorRefA = TensorRef<ElementA, LayoutA>;

  static_assert(std::is_same<LayoutA, LayoutA_>::value,
                "Only supported for row-major A matrix");

  using ElementB = ElementB_;
  using ElementC = ElementC_;

  using ElementAccumulator = ElementAccumulator_;
  using EpilogueOutputOp = EpilogueOutputOp_;

  static ComplexTransform const kTransformA = ComplexTransform::kNone;
  static ComplexTransform const kTransformB = ComplexTransform::kNone;

  static FloatRoundStyle const Round = cutlass::FloatRoundStyle::round_to_nearest;

  // number of return elements in a global access
  static int const kElementsPerAccess = kElementsPerAccess_;

  using FragmentA = Array<ElementA, kElementsPerAccess>;
  using FragmentB = Array<ElementB, kElementsPerAccess>;
  using FragmentCompute = Array<ElementAccumulator, kElementsPerAccess>;

  // thread block shape (kThreadCount, mThreadCount): x covers K, y covers M
  static int const kThreadCount = std::min(static_cast<int>(128 / (kElementsPerAccess * sizeof(ElementA))), 16);
  static int const mThreadCount = 128 / kThreadCount;

  // rolling tile shape
  static int const kTileA = kThreadCount * kElementsPerAccess;
  static int const mTileA = mThreadCount * 8;

  //
  // Structures
  //

  /// Argument structure
  struct Arguments
  {
    MatrixCoord problem_size;                     // (row = m, column = k)
    int32_t batch_count;
    typename EpilogueOutputOp::Params output_op;

    TensorRefA ref_A;

    ElementB const *ptr_B;
    ElementC const *ptr_C;
    ElementC *ptr_D;

    int64_t batch_stride_A;
    int64_t batch_stride_B;
    int64_t batch_stride_C;
    int64_t batch_stride_D;

    //
    // Methods
    //

    Arguments() : batch_count(0) {}

    Arguments(
        MatrixCoord problem_size,
        int32_t batch_count,
        typename EpilogueOutputOp::Params output_op,
        TensorRefA ref_A,
        void const *ptr_B,
        void const *ptr_C,
        void *ptr_D,
        int64_t batch_stride_A,
        int64_t batch_stride_B,
        int64_t batch_stride_C,
        int64_t batch_stride_D) : problem_size(problem_size),
                                  batch_count(batch_count),
                                  output_op(output_op),
                                  ref_A(ref_A),
                                  ptr_B(static_cast<ElementB const *>(ptr_B)),
                                  ptr_C(static_cast<ElementC const *>(ptr_C)),
                                  ptr_D(static_cast<ElementC *>(ptr_D)),
                                  batch_stride_A(batch_stride_A),
                                  batch_stride_B(batch_stride_B),
                                  batch_stride_C(batch_stride_C),
                                  batch_stride_D(batch_stride_D)
    {
    }

    /// Convenience constructor for the non-batched case (batch_count == 1,
    /// unit batch strides). The previous version passed a spurious extra `1`
    /// to the 11-parameter constructor above, which could not compile.
    Arguments(
        MatrixCoord problem_size,
        typename EpilogueOutputOp::Params output_op,
        TensorRefA ref_A,
        void const *ptr_B,
        void const *ptr_C,
        void *ptr_D) : Arguments(problem_size,
                                 1,
                                 output_op,
                                 ref_A,
                                 ptr_B,
                                 ptr_C,
                                 ptr_D,
                                 1,
                                 1,
                                 1,
                                 1)
    {
    }

    /// Copies all fields from another Arguments instance.
    Status update(Arguments const &args)
    {
      problem_size = args.problem_size;
      batch_count = args.batch_count;
      output_op = args.output_op;
      // Fixed: was the self-assignment `ref_A = ref_A;`, which silently
      // dropped the caller's new A-matrix reference.
      ref_A = args.ref_A;
      ptr_B = args.ptr_B;
      ptr_C = args.ptr_C;
      ptr_D = args.ptr_D;
      batch_stride_A = args.batch_stride_A;
      batch_stride_B = args.batch_stride_B;
      batch_stride_C = args.batch_stride_C;
      batch_stride_D = args.batch_stride_D;
      return Status::kSuccess;
    }
  };

  using Params = Arguments;

  /// Shared memory storage structure (this kernel uses no shared memory)
  union SharedStorage
  {
  };

public:
  //
  // Methods
  //

  CUTLASS_DEVICE
  GemvStridedBatched() {}

  /// Determines whether kernel satisfies alignment
  static Status can_implement(cutlass::MatrixCoord const &problem_size)
  {
    // K must be a multiple of the vectorized global-access width.
    if (problem_size.column() % kElementsPerAccess != 0)
      return Status::kErrorMisalignedOperand;
    return Status::kSuccess;
  }

  static Status can_implement(Arguments const &args)
  {
    return can_implement(args.problem_size);
  }

  /// Executes one GEMV
  CUTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage)
  {
    // Loop over batch indices; the grid may be smaller than batch_count, in
    // which case each z-block strides over the remaining batches.
    for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z)
    {
      int k_col_id = threadIdx.x;
      int m_row_id = threadIdx.y;

      // problem_size (row = m, column = k)
      // matrix A (batch, m, k)
      // vector B (batch, 1, k)
      // vector C (batch, m, 1)
      // vector D (batch, m, 1)

      // move in the batch dimension
      ElementA const *ptr_A = params.ref_A.data() + batch_idx * params.batch_stride_A;
      ElementB const *ptr_B = params.ptr_B + batch_idx * params.batch_stride_B;
      ElementC const *ptr_C = params.ptr_C + batch_idx * params.batch_stride_C;
      ElementC *ptr_D = params.ptr_D + batch_idx * params.batch_stride_D;

      // move in the k dimension
      ptr_A += k_col_id * kElementsPerAccess;
      ptr_B += k_col_id * kElementsPerAccess;

      // move in the m dimension
      ptr_A += m_row_id * params.problem_size.column();
      ptr_C += m_row_id;
      ptr_D += m_row_id;

      NumericArrayConverter<ElementAccumulator, ElementA, kElementsPerAccess, Round> srcA_converter;
      NumericArrayConverter<ElementAccumulator, ElementB, kElementsPerAccess, Round> srcB_converter;

      // Each iteration processes an mTileA-row slab of A; this thread owns
      // mTileA / mThreadCount of those rows, interleaved by mThreadCount.
      for (; m_row_id < params.problem_size.row(); m_row_id += mTileA)
      {
        // NOTE: only accum[0] is explicitly 0.f; the remainder are
        // value-initialized to zero by the aggregate initializer.
        ElementAccumulator accum[mTileA / mThreadCount] = {0.f};

        FragmentB fragB;
        FragmentA fragA[mTileA / mThreadCount];

        // number of rows this thread actually owns in the (possibly partial) slab
        int mElemCountPerTile = min(mTileA / mThreadCount, (params.problem_size.row() - m_row_id - 1) / mThreadCount + 1);

        // Vectorized main loop over full kTileA-wide chunks of K.
        int kUnroll = 0;
        for (; kUnroll < params.problem_size.column() / kTileA * kTileA; kUnroll += kTileA)
        {
          for (int m = 0; m < mElemCountPerTile; m++)
          {
            // fetch from matrix A
            arch::global_load<FragmentA,
                              sizeof(FragmentA),
                              arch::CacheOperation::LastUse>(fragA[m], (ptr_A + kUnroll + m * mThreadCount * params.problem_size.column()), true);
          }

          // fetch from vector B
          arch::global_load<FragmentB,
                            sizeof(FragmentB),
                            arch::CacheOperation::Always>(fragB, (ptr_B + kUnroll), true);

          for (int m = 0; m < mElemCountPerTile; m++)
          {
            FragmentCompute fragB_Compute = srcB_converter(fragB);
            FragmentCompute fragA_Compute = srcA_converter(fragA[m]);

            // Math
            CUTLASS_PRAGMA_UNROLL
            for (int e = 0; e < kElementsPerAccess; e++)
            {
              accum[m] += fragA_Compute.at(e) * fragB_Compute.at(e);
            }
          }
        }

        // calculate the rest of K elements
        // each thread fetch 1 element each time
        for (int k = kUnroll + k_col_id; k < params.problem_size.column(); k += kThreadCount)
        {
          // `- k_col_id * kElementsPerAccess` undoes the vectorized base
          // offset applied to ptr_A/ptr_B above.
          ElementB b = *(ptr_B - k_col_id * kElementsPerAccess + k);
          for (int m = 0; m < mElemCountPerTile; m++)
          {
            ElementA a = *(ptr_A - k_col_id * kElementsPerAccess + k + m * mThreadCount * params.problem_size.column());
            accum[m] += ElementAccumulator(a) * ElementAccumulator(b);
          }
        }

        EpilogueOutputOp output_op(params.output_op);

        typename EpilogueOutputOp::FragmentOutput source_fragment[mTileA / mThreadCount];

        // prefetch from source matrix C
        if (output_op.is_source_needed())
        {
          for (int m = 0; m < mElemCountPerTile; m++)
          {
            source_fragment[m][0] = *(ptr_C + m * mThreadCount);
          }
        }

        typename EpilogueOutputOp::FragmentAccumulator accum_fragment;
        typename EpilogueOutputOp::FragmentOutput output_fragment;

        for (int m = 0; m < mElemCountPerTile; m++)
        {
          // Butterfly reduction across the kThreadCount lanes that share
          // this row; kThreadCount <= 16, so the exchange stays inside a warp.
          for (int mask = (kThreadCount >> 1); mask > 0; mask >>= 1)
          {
            accum[m] += __shfl_xor_sync(0xFFFFFFFF, accum[m], mask, 32);
          }

          // Lane 0 of each row applies the epilogue and writes the result.
          if (k_col_id == 0)
          {
            accum_fragment[0] = accum[m];
            if (output_op.is_source_needed())
            {
              output_fragment = output_op(accum_fragment, source_fragment[m]);
            }
            else
            {
              output_fragment = output_op(accum_fragment);
            }
            *(ptr_D + m * mThreadCount) = output_fragment[0];
          }
        }

        // advance to the next mTileA-row slab
        ptr_A += mTileA * params.problem_size.column();
        ptr_C += mTileA;
        ptr_D += mTileA;
      }
    }
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -166,8 +166,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -182,7 +182,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -173,8 +173,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -189,7 +189,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -196,8 +196,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -212,7 +212,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -204,8 +204,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -220,7 +220,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -163,6 +163,12 @@ public:
int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc;
int const max_blk_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % size(cluster_shape));
int blk_per_device = min_num_gpc * max_blk_occupancy_per_gpc;
// The calculation below allows for larger grid size launch for different GPUs.
int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc;
int const max_blk_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % size(cluster_shape));
blk_per_device += max_blk_occupancy_per_residual_gpc;
blk_per_device = sm_count < blk_per_device ? sm_count : blk_per_device;
launch_grid.x = std::min(

View File

@ -630,9 +630,6 @@ struct ThreadblockSwizzleStreamK {
}
// Guards needed for PyCUTLASS library generation
#if !defined(CUTLASS_PYTHON_HOST_CC)
//
// Device-side interface
//
@ -692,7 +689,7 @@ struct ThreadblockSwizzleStreamK {
return GemmCoord(m, n, get_batch_idx());
}
/// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rastorization)
/// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rasterization)
CUTLASS_DEVICE
GemmCoord get_tile_offset_row_major(int tile_idx) const
{
@ -740,7 +737,7 @@ struct ThreadblockSwizzleStreamK {
div_mod_sk_iters_per_region(region_idx, iter_in_region, iter);
int big_block_iters = (sk_big_blocks_per_region * sk_iters_per_normal_block()) + sk_big_blocks_per_region; // number of iterations in the region's big blocks
int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal bocks
int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal blocks
int big_block_idx_in_region = div_mod_sk_iters_per_big_block.div(iter_in_region);
int normal_block_idx_in_region = sk_big_blocks_per_region + div_mod_sk_iters_per_normal_block.div(normal_block_iters);
@ -794,8 +791,6 @@ struct ThreadblockSwizzleStreamK {
return get_sk_block_idx(iter);
}
#endif // !defined(CUTLASS_PYTHON_HOST_CC)
};
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -450,7 +450,7 @@ private :
CUTLASS_DEVICE
void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
if (barrier_token == BarrierStatus::WaitAgain) {
consumer_wait(stage, phase);
full_barrier_ptr_[stage].wait(phase);
}
}
@ -654,7 +654,7 @@ public :
consumer_release(state.index());
}
protected:
private:
FullBarrier *full_barrier_ptr_ = nullptr;
EmptyBarrier *empty_barrier_ptr_ = nullptr;
Params params_;
@ -976,6 +976,11 @@ public:
++stage_;
}
CUTLASS_DEVICE
void advance() {
++stage_;
}
private:
CUTLASS_DEVICE

View File

@ -89,19 +89,16 @@ public:
/// Waits until the semaphore is equal to the given value
CUTLASS_DEVICE
void wait(int status = 0) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
while( __syncthreads_and(state != status) ) {
fetch();
}
__syncthreads();
#endif
}
/// Updates the lock with the given result
CUTLASS_DEVICE
void release(int status = 0) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
__syncthreads();
if (wait_thread) {
@ -111,7 +108,6 @@ public:
asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
#endif
}
#endif
}
};

View File

@ -3,8 +3,8 @@
## Layout
This document describes `Layout`, CuTe's core abstraction.
A `Layout` maps from (a) logical coordinate space(s)
to a physical index space.
A `Layout` maps from a logical coordinate space
to an index space.
`Layout`s present a common interface to multidimensional array access
that abstracts away the details of how the array's elements are organized in memory.
@ -19,7 +19,11 @@ This can help users do things like partition layouts of data over layouts of thr
## Layouts and Tensors
Any of the `Layout`s discussed in this section can be composed with data -- a pointer or an array -- to create a `Tensor`. The responsibility of the `Layout` is to define valid coordinate space(s) and, therefore, the logical shape of the data and map those into an index space. The index space is precisely the offset that would be used to index into the array of data.
Any of the `Layout`s discussed in this section can be composed with data -- e.g., a pointer or an array -- to create a `Tensor`.
The `Layout`'s logical coordinate space represents the logical "shape" of the data,
e.g., the modes of the `Tensor` and their extents.
The `Layout` maps a logical coordinate into an index,
which is an offset to be used to index into the array of data.
For details on `Tensor`, please refer to the
[`Tensor` section of the tutorial](./03_tensor.md).
@ -31,31 +35,31 @@ Both `Shape` and `Stride` are `IntTuple` types.
### IntTuple
An `IntTuple` is an integer or a tuple of `IntTuple`s.
An `IntTuple` is defined recursively as either a single integer, or a tuple of `IntTuple`s.
This means that `IntTuple`s can be arbitrarily nested.
Operations defined on `IntTuple`s include the following.
* `get<I>(IntTuple)`: The `I`th element of the `IntTuple`. Note that `get<0>` is defined for integer `IntTuples`.
* `get<I>(IntTuple)`: The `I`th element of the `IntTuple`. For an `IntTuple` consisting of a single integer, `get<0>` is just that integer.
* `rank(IntTuple)`: The number of elements in an `IntTuple`. An int has rank 1, a tuple has rank `tuple_size`.
* `rank(IntTuple)`: The number of elements in an `IntTuple`. A single integer has rank 1, and a tuple has rank `tuple_size`.
* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc.
* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc.
* `size(IntTuple)`: The product of all elements of the IntTuple.
* `size(IntTuple)`: The product of all elements of the `IntTuple`.
We write `IntTuple`s with parenthesis to denote the hierarchy. E.g. `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s.
We write `IntTuple`s with parenthesis to denote the hierarchy. For example, `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s.
## Layout
A `Layout` is then a pair of `IntTuple`s. The first defines the abstract *shape* of the layout and the second defines the *strides*, which map from coordinates within the shape to the index space.
A `Layout` is then a pair of `IntTuple`s. The first element defines the abstract *shape* of the `Layout`, and the second element defines the *strides*, which map from coordinates within the shape to the index space.
As a pair of `IntTuple`s, we can define many similar operations on `Layout`s including
Since a `Layout` is just a pair of `IntTuple`s, we can define operations on `Layout`s analogous to those defined on `IntTuple`.
* `get<I>(Layout)`: The `I`th sub-layout of the `Layout`.
* `rank(Layout)`: The number of modes in a `Layout`.
* `depth(Layout)`: The number of hierarchical `Layout`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc.
* `depth(Layout)`: The number of hierarchical `Layout`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc.
* `shape(Layout)`: The shape of the `Layout`.
@ -86,7 +90,7 @@ These hierarchical access functions include the following.
### Vector examples
Then, we can define a vector as any `Shape` and `Stride` pair with `rank == 1`.
We define a vector as any `Shape` and `Stride` pair with `rank == 1`.
For example, the `Layout`
```
@ -95,9 +99,9 @@ Stride: (1)
```
defines a contiguous 8-element vector.
Similarly, with a stride of `(2)`,
For a vector with the same Shape but a Stride of `(2)`,
the interpretation is that the eight elements
are stored at positions 0, 2, 4, $\dots$.
are stored at positions 0, 2, 4, $\dots$, 14.
By the above definition, we *also* interpret
@ -168,9 +172,17 @@ auto layout_2x4 = make_layout(make_shape (2, make_shape (2,2)),
make_stride(4, make_stride(2,1)));
```
The `make_layout` function returns a `Layout`.
It deduces the returned `Layout`'s template arguments from the function's arguments.
Similarly, the `make_shape` and `make_stride` functions
return a `Shape` and a `Stride`, respectively.
CuTe often uses these `make_*` functions,
because constructor template argument deduction (CTAD)
does not work for `cute::tuple` as it works for `std::tuple`.
## Using a `Layout`
The fundamental use of a `Layout` is to map between logical coordinate space(s) and index space. For example, to print an arbitrary rank-2 layout, we can write the function
The fundamental use of a `Layout` is to map between logical coordinate space(s) and an index space. For example, to print an arbitrary rank-2 layout, we can write the function
```c++
template <class Shape, class Stride>

View File

@ -73,6 +73,7 @@ In C++, we identify a Tuple with the
`cute::tuple` behaves like `std::tuple`, but it works on device or host,
and it imposes restrictions on its template arguments for performance and simplicity.
#### IntTuple
CuTe then defines an IntTuple as either an integer, or a Tuple of IntTuple.
@ -136,7 +137,7 @@ This code produces the following text output.
```
`print(layout(1, 1))` prints the mapping of
the logical 2-D coordinate (1,1) to 1-D index, which is 4.
the logical 2-D coordinate (1,1) to the 1-D index, which is 4.
You can see that from the table,
which shows the left logical index as the "row,"
and the right logical index as the "column."
@ -302,13 +303,13 @@ Both humans and CuTe compute composition using the following rules.
2. Concatenation: A layout can be expressed as the concatenation of its sublayouts. We denote concatenation with parentheses: $B = (B_0,B_1,...)$. The CuTe function `make_layout`, when given zero or more `Layout`s, concatenates them.
3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B0, B1, ...) = (A \circ B0, A \circ B1, ...)$.
3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B_0, B_1, ...) = (A \circ B_0, A \circ B_1, ...)$.
4. "Base case": For layouts $A = a : b$ and $B = c : d$ with integral shape and stride, $A \circ B = R = c : (b * d)$.
5. By-mode composition: Let $\langle B, C \rangle$ (angle brackets, not parentheses)
denote a tuple of two layouts B and C, not their concatenation. Let A = (A0, A1).
Then, $A \circ \langle B, C \rangle = (A0, A1) \circ \langle B, C \rangle = (A0 \circ B, A1 \circ C)$.
denote a tuple of two layouts B and C, not their concatenation. Let $A = (A_0, A_1)$.
Then, $A \circ \langle B, C \rangle = (A_0, A_1) \circ \langle B, C \rangle = (A_0 \circ B, A_1 \circ C)$.
This allows the application of composition independently to sublayouts of $A$.
#### Examples: Reshape a vector into a matrix
@ -359,6 +360,55 @@ The resulting layout has shape $(4,5)$, just as before. What are the strides?
5. Result: (4:10, 5:2), which by concatenation is (4,5) : (10,2).
#### Example: Reshape a matrix into another matrix
The composition $((20,2):(16,4) \circ (4,5):(1,4))$
expresses reshaping the matrix with layout (20,2):(16,4),
into a 4 x 5 matrix in a column-major way.
1. By deconcatenation, $(4,5) : (1,4)$ is $(4:1, 5:4)$.
2. Composition is distributive, so $(20,2):(16,4) \circ (4:1, 5:4)$ is $((20,2):(16,4) \circ 4:1, (20,2):(16,4) \circ 5:4)$.
3. $(20,2):(16,4) \circ 4:1$ has shape $4$ and stride $16$. (4:1 expresses picking the first 4 consecutive elements of (20,2):(16,4). These elements run down the 0th column (leftmost mode) of the layout, whose stride is 16.)
4. $(20,2):(16,4) \circ 5:4$ has shape $5$ and stride $64 = 4 \cdot 16$.
5. Result: $(4:16, 5:64)$, which by concatenation is $(4,5) : (16,64)$.
We get exactly this result with CuTe
if we use compile-time shapes and strides.
The following C++ code prints `(_4,_5):(_16,_64)`.
```c++
using namespace cute;
auto a = make_layout(make_shape(Int<20>{}, _2{}), make_stride(_16{}, _4{}));
auto b = make_layout(make_shape( _4{}, _5{}), make_stride( _1{}, _4{}));
auto c = composition(a, b);
printf("\n");
print(c);
```
Results may _look_ different (but are the same mathematically)
if we use run-time integers.
The following C++ code prints `((4,1),(5,1)):((16,4),(64,4))`.
```c++
using namespace cute;
auto a = make_layout(make_shape(20, 2), make_stride(16, 4));
auto b = make_layout(make_shape( 4, 5), make_stride( 1, 4));
auto c = composition(a, b);
printf("\n");
print(c);
```
((4,1),(5,1)):((16,4),(64,4)) is effectively the same layout
as (4,5) : (16,64), because the 1s in the shape don't affect the layout
(as a mathematical function from one integer to one integer).
CuTe chooses not to simplify layout computations
with run-time values in them as much as it could,
because simplifications involving run-time values have a run-time cost.
### Product
CuTe includes four different kinds of layout products.
@ -428,7 +478,7 @@ results in Shape ((2, 2), (3, 4)) and Stride ((1, 2), (16, 4)).
| (1,1) | 3 | 19 | 35 | 7 | 23 | 39 | 11 | 27 | 43 | 15 | 31 | 47 |
Note how the tile appears in the leftmost column and is reproduced
in each column in the same order as the matrix-of-tiles. That is,
in each column in the same order as the matrix-of-tiles. That is,
the tile can be indexed through the first mode of the result and the
matrix-of-tiles can be indexed through the second mode.
@ -456,8 +506,8 @@ Shape ((3, 2), (4, 2)) and Stride ((16, 1), (4, 2)).
| (2,1) | 33 | 37 | 41 | 45 | 35 | 39 | 43 | 47 |
The tile is now interleaved or "raked" with the other 3x4 matrix-of-tiles
instead of appearing as blocks. Other references call this cyclic
distribution.
instead of appearing as blocks. Other references call this a "cyclic
distribution."
This might look familiar if you have ever used ScaLAPACK.
It expresses a 2-D block cyclic distribution of a 6 x 8 matrix
@ -542,7 +592,87 @@ CuTe includes 3 different kinds of layout division operations.
We will summarize these in the sections that follow.
#### Logical divide : the intuitive tiling
#### Logical divide
##### Example worked in detail
This section will work the following logical divide example in detail.
```c++
Layout a = make_layout(24, 2);
Layout b = make_layout( 4, 2);
Layout c = logical_divide(a, b);
```
Logical divide produces a rank-2 `Layout`,
where mode 0 (the leftmost mode) corresponds to the divisor `b`,
and mode 1 (the rightmost mode) corresponds to the "remainder."
Intuitively, dividing 24 elements into groups of 4 yields 6 groups,
so we know that mode 1 has 6 elements.
We just don't know its shape yet.
CuTe defines `logical_divide(a, b)` as
`composition(a, make_layout(b, complement(b, size(a))))`.
Here, `size(a)` is 24.
What is `complement(b, 24)`?
Intuitively, it means "the remainder,"
what's left over after applying `b` to 0, 1, 2, $\dots$, 23.
The layout 4:2 means "take 4 elements at even-numbered indices."
The following table overlays the range of 4:2
atop the complement's codomain 0, 1, $\dots$, 23.
| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | $\dots$ | 23 |
Layouts are linear, so their range must include zero.
The complement of 4:2 with respect to 24 is thus a layout whose range
* includes zero;
* does not include any other elements of the range of 4:2
(i.e., satisfies the disjoint property; see above); and
* includes as much of 0, 1, $\dots$, 23 as possible
(so that it forms the "remainder" of 4:2 with respect to 24).
Intuitively, the range of the complement must look like this:
0, 1, 8, 9, 16, 17.
The resulting layout is ordered.
It has size 6 and cosize 18,
so it satisfies the bounded property (see above).
This is the layout (2, 3) : (1, 8).
(Going from this intuitive sense of the complement
to knowing how to compute it directly
is out of scope for this part of the tutorial.)
The following table shows 4:2 with its complement (2, 3) : (1, 8).
| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | | | | | | | | | |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | $\dots$ | 23 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Range of complement | 0 | 1 | | | | | | | 8 | 9 | | | | | | | 16 | 17 | | |
Now we know that `logical_divide`(24:2, 4:2) is
`composition`(24:2, `make_layout`(4:2, (2,3):(1,8))).
The composition of two layouts has the shape of the second (rightmost) layout,
so the resulting shape is (4, (2, 3)).
We see that the leftmost mode 4 corresponds to the divisor 4:2,
and the rightmost mode (2, 3) describes what's "left over"
from the original shape 24.
What are the strides?
We can start from the leftmost mode.
4:2 takes every other element (the even-numbered elements) of 24:2.
That is a stride-2 access pattern applied to a layout that itself has stride 2.
The resulting stride is 4.
Similarly, the stride 2 of 24:2
doubles the two strides of the rightmost mode.
The resulting layout is (4, (2, 3)) : (4, (2, 16)).
##### Tiling example
Suppose I have the 6 x 8 matrix from the Raked Product section
and want to "collect" the `tile`, turning the Raked Product into
@ -607,7 +737,7 @@ Note that this is the same layout as the result in the Logical Product section.
That is, the first mode is our original tile (and can be interpreted as a 2x2 matrix itself)
and the second mode is its logical layout within the raked layout.
##### More Examples of Divide
#### More Examples of Divide
For brevity, shapes can be used with `logical_divide` and `tiled_divide` to quickly split and tile modes of a tensor. For example, this C++ code

View File

@ -200,7 +200,7 @@ class ArtifactManager:
self.compiled_cache_host.insert(key, compiled_host_fns)
return True
def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
def emit_compile_(self, operation_list, compilation_options):
"""
Compile a list of kernels and store them into database
"""
@ -306,41 +306,17 @@ class ArtifactManager:
cubin_image = file.read()
# Set up the host-side library code
if requires_nvcc_hostlib_compilation:
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
else:
options = compilation_options.get()
cmd = (
"echo '%s'|g++ -x c++ -fpermissive -w -fPIC -DCUTLASS_PYTHON_HOST_CC=1"
% source_buffer_host
)
filtered_opts = [
"-default-device",
"-Xcicc",
"-Xllc",
"--expt-relaxed-constexpr",
"-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored",
]
for opt in options:
opt = opt.decode("utf-8")
if opt not in filtered_opts and "-arch=sm_" not in opt:
if "--include-path=" in opt:
cmd += " " + opt.replace(
"--include-path=",
"-I",
)
else:
cmd += " " + opt
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
tempfile.tempdir = "./"
temp = tempfile.NamedTemporaryFile(
@ -375,7 +351,6 @@ class ArtifactManager:
# save the cubin
operation_key = []
operation_list = []
requires_nvcc_hostlib_compilation = False
for operation in operations:
# step 1: get kernel string as key
key = operation.rt_module.emit() + operation.procedural_name() + self.backend
@ -398,17 +373,9 @@ class ArtifactManager:
operation_list.append(operation.rt_module)
operation_key.append(key)
# Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
# the PyCUTLASS host-side library. Otherwise, g++ will be used.
if isinstance(operation, GemmOperationUniversal) and operation.api == ApiVersion.v3x:
if self.backend == "nvrtc":
raise RuntimeError("CUTLASS 3 kernels currently require NVCC for compilation.")
requires_nvcc_hostlib_compilation = True
if len(operation_list) > 0:
cubin_image, host_lib, host_file = self.emit_compile_(
operation_list, compile_options, requires_nvcc_hostlib_compilation)
operation_list, compile_options)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:

View File

@ -43,10 +43,10 @@
template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
.def("__init__", [](cutlass::TensorRef<T, L>& tensor_ref, int64_t address, const L& layout_ ) {
.def(py::init([](int64_t address, const L& layout_ ) {
T* ptr = reinterpret_cast< T*>(address);
new (&tensor_ref) cutlass::TensorRef<T, L>(ptr, layout_);
})
return new cutlass::TensorRef<T, L>(ptr, layout_);
}))
.def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
T* ptr = tensor_ref.data();
return int64_t(ptr);

View File

@ -29,9 +29,12 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import copy
import os
from pybind11.setup_helpers import Pybind11Extension
import setuptools
from setuptools import setup
from setuptools.command.build_ext import build_ext
def _cutlass_path_from_dir() -> str:
@ -61,31 +64,57 @@ cutlass_path = (
else _cutlass_path_from_dir()
)
cuda_install_path = (
os.getenv('CUDA_INSTALL_PATH')
if os.getenv('CUDA_INSTALL_PATH') is not None
else _cuda_install_path_from_nvcc()
)
ext_modules = []
try:
from pybind11.setup_helpers import Pybind11Extension, build_ext
include_dirs = [
cutlass_path + '/include',
cuda_install_path + '/include',
cutlass_path + '/tools/util/include',
cutlass_path + '/test',
]
class BuildExtension(build_ext):
"""
Wrapper around `build_ext` to use NVCC when compiling the CUTLASS Python-C++ bindings.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
ext_modules = [
Pybind11Extension('cutlass_bindings',
['cutlass/cpp/cutlass_bindings.cpp'],
include_dirs=include_dirs,
extra_compile_args=['-fpermissive', '-w', '-std=c++17', '-DCUTLASS_PYTHON_HOST_CC=1'])
]
except ImportError:
pass
def build_extensions(self):
original_compile = self.compiler._compile
def custom_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
"""
Wrapper around build_ext.compiler._compile method
"""
postargs = copy.deepcopy(extra_postargs)
postargs = [f for f in postargs if f not in ['-g0', '-fvisibility=hidden']]
postargs.extend(["-Xcompiler='-fPIC'", "-Xcompiler='-g0'", "-Xcompiler='-O3'", '-x', 'cu'])
try:
original_compiler = self.compiler.compiler_so
self.compiler.set_executable('compiler_so', [f'{cuda_install_path}/bin/nvcc'])
original_compile(obj, src, ext, cc_args, postargs, pp_opts)
finally:
self.compiler.set_executable('compiler_so', original_compiler)
self.compiler._compile = custom_compile
super().build_extensions()
include_dirs = [
cutlass_path + '/include',
cuda_install_path + '/include',
cutlass_path + '/tools/util/include',
cutlass_path + '/test',
]
ext_modules = [
Pybind11Extension('cutlass_bindings',
['cutlass/cpp/cutlass_bindings.cpp'],
include_dirs=include_dirs,
extra_compile_args=['-Xcompiler="-fpermissive"', '-w', '-std=c++17'],
libraries=['cudart'])
]
setup(
@ -103,4 +132,7 @@ setup(
'treelib'
],
ext_modules=ext_modules,
cmdclass={
'build_ext': BuildExtension
}
)

View File

@ -41,6 +41,7 @@ add_custom_target(
cutlass_test_unit_gemm_device_tensorop_planar_complex
cutlass_test_unit_gemm_device_sparse_tensorop_sm80
cutlass_test_unit_gemv_device
cutlass_test_unit_gemv_device_strided_batched
cutlass_test_unit_gemm_device_tensorop_sm90
cutlass_test_unit_gemm_device_tensorop_cluster_multicast_sm90
)
@ -60,6 +61,7 @@ add_custom_target(
test_unit_gemm_device_tensorop_planar_complex
test_unit_gemm_device_sparse_tensorop_sm80
test_unit_gemv_device
test_unit_gemv_device_strided_batched
test_unit_gemm_device_tensorop_sm90
)
@ -498,6 +500,15 @@ cutlass_test_unit_add_executable(
gemv.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemv_device_strided_batched
BATCH_SOURCES ON
BATCH_SIZE 4
gemv_strided_batched.cu
)
if (NOT CUDA_COMPILER MATCHES "[Cc]lang")
add_dependencies(

View File

@ -77,7 +77,8 @@ struct TestbedImpl {
using StrideA = typename Gemm::GemmKernel::StrideA;
using ElementB = typename Gemm::GemmKernel::ElementB;
using StrideB = typename Gemm::GemmKernel::StrideB;
using ElementC = typename Gemm::GemmKernel::ElementC;
using ElementC = std::conditional_t<std::is_void_v<typename Gemm::GemmKernel::ElementC>,
typename Gemm::GemmKernel::ElementD,typename Gemm::GemmKernel::ElementC>;
using StrideC = typename Gemm::GemmKernel::StrideC;
using ElementD = typename Gemm::GemmKernel::ElementD;
using StrideD = typename Gemm::GemmKernel::StrideD;

View File

@ -0,0 +1,490 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide strided batched GEMV interface
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/kernel/gemv_strided_batched.h"
#include "cutlass/gemm/device/gemv_strided_batched.h"
#include "../../common/cutlass_unit_test.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/gemm_complex.h"
#include "testbed_utils.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
namespace gemm {
/// Test harness for the device-wide strided batched GEMV interface.
///
/// Owns host/device tensors for operand A (matrices), B (input vectors),
/// C (source vectors), and D (output vectors), initializes them, launches
/// the GemvStridedBatched operator under test, and verifies the device
/// output against a host-side GemmComplex reference computation.
template <typename GemvStridedBatched>
class TestbedStridedBatchedGemv
{
public:

  // Operand, layout, and accumulator types mirrored from the kernel under test.
  using ElementA = typename GemvStridedBatched::ElementA;
  using LayoutA = typename GemvStridedBatched::LayoutA;
  using ElementB = typename GemvStridedBatched::ElementB;
  using ElementC = typename GemvStridedBatched::ElementC;
  using ElementAccumulator = typename GemvStridedBatched::ElementAccumulator;
  using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute;

  // Vector operands (B, C, D) are modeled as row-major (n x 1) tensors.
  using LayoutV = cutlass::layout::RowMajor;

private:

  /// Initialization
  cutlass::Distribution::Kind init_A;   // distribution used to fill operand A
  cutlass::Distribution::Kind init_B;   // distribution used to fill operand B
  cutlass::Distribution::Kind init_C;   // distribution used to fill operand C
  uint64_t seed;                        // base RNG seed; offset per operand below

  // Batches are packed contiguously along the row dimension: tensor_A holds
  // batch_count stacked (m x k) matrices; the vectors hold stacked columns.
  cutlass::HostTensor<ElementA, LayoutA> tensor_A;
  cutlass::HostTensor<ElementB, LayoutV> tensor_B;
  cutlass::HostTensor<ElementC, LayoutV> tensor_C;
  cutlass::HostTensor<ElementC, LayoutV> tensor_D;
  cutlass::HostTensor<ElementC, LayoutV> reference_D;  // host-computed expected output

public:

  //
  // Methods
  //

  /// Constructor: records operand distributions and the RNG seed.
  TestbedStridedBatchedGemv(
    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
    uint64_t seed_ = 2023):
    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {}

  /// Helper to initialize a tensor view
  ///
  /// Fills `view` according to `dist_kind` using `seed`. Returns false (and
  /// fails the test) for distributions that are not implemented.
  template <typename Element, typename Layout>
  bool initialize_tensor(
    cutlass::TensorView<Element, Layout> view,
    cutlass::Distribution::Kind dist_kind,
    uint64_t seed) {

    if (dist_kind == cutlass::Distribution::Uniform) {

      double scope_max, scope_min;
      int bits_input = cutlass::sizeof_bits<Element>::value;
      int bits_output = cutlass::sizeof_bits<typename GemvStridedBatched::ElementC>::value;

      // Narrow the random value range for low-precision types so the
      // accumulation remains exactly representable.
      if (bits_input == 1) {
        scope_max = 2;
        scope_min = 0;
      } else if (bits_input <= 8) {
        scope_max = 2;
        scope_min = -2;
      } else if (bits_output == 16) {
        // NOTE(review): this branch keys off the *output* width while the
        // others key off the input width — confirm the asymmetry is intended.
        scope_max = 5;
        scope_min = -5;
      } else {
        scope_max = 8;
        scope_min = -8;
      }

      cutlass::reference::host::TensorFillRandomUniform(
        view, seed, scope_max, scope_min, 0);
    }
    else if (dist_kind == cutlass::Distribution::Identity) {
      cutlass::reference::host::TensorFillIdentity(view);
    }
    else if (dist_kind == cutlass::Distribution::Gaussian) {
      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
    }
    else if (dist_kind == cutlass::Distribution::Sequential) {
      cutlass::reference::host::BlockFillSequential(
        view.data(), view.capacity());
    }
    else {
      // TODO: Implement the rest
      EXPECT_TRUE(false) << "Not implemented";
      return false;
    }

    return true;
  }

  /// Initializes data structures
  ///
  /// Sizes all tensors for `batch_count` batches of `problem_size`, fills
  /// them on the host, and copies the operands to the device.
  void initialize(
    cutlass::MatrixCoord problem_size,
    int32_t batch_count
  ) {

    //
    // Allocate the GEMV workspace
    //

    tensor_A.resize({batch_count * problem_size.row(), problem_size.column()});
    tensor_B.resize({batch_count * problem_size.column(), 1});
    tensor_C.resize({batch_count * problem_size.row(), 1});
    tensor_D.resize({batch_count * problem_size.row(), 1});
    // Host-only allocation: the reference output never goes to the device.
    reference_D.resize({batch_count * problem_size.row(), 1}, false);

    EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 1));
    EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2));
    EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 3));

    // It is possible to randomly initialize to all zeros, so override this with non-zeros
    // in the upper left corner of each operand.
    tensor_A.host_view().at({0, 0}) = typename GemvStridedBatched::ElementA(1);
    tensor_B.host_view().at({0, 0}) = typename GemvStridedBatched::ElementB(1);
    tensor_C.host_view().at({0, 0}) = typename GemvStridedBatched::ElementC(1);

    // Seed the reference output with C so beta scaling starts from the same data.
    cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view());

    tensor_A.sync_device();
    tensor_B.sync_device();
    tensor_C.sync_device();
    tensor_D.sync_device();
  }

  /// Compares computed reference with device reference and outputs to a file if incorrect
  bool compare_reference(
    cutlass::MatrixCoord problem_size,
    ElementCompute alpha,
    ElementCompute beta) {

    tensor_D.sync_host();

    // Guard against degenerate all-zero tensors that would make the
    // subsequent equality check vacuous.
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0);

    bool passed = cutlass::reference::host::TensorEquals(reference_D.host_view(), tensor_D.host_view());

    EXPECT_TRUE(passed) << " mismatched reference";

    if (!passed) {

      // NOTE(review): filename appears inherited from the universal GEMM
      // testbed — consider a gemv-specific name.
      std::ofstream file("testbed_universal_errors.txt");

      file
        << "problem: " << problem_size
        << ", alpha: " << alpha << ", beta: " << beta << "\n\n";

      file
        << "A =\n" << tensor_A.host_view()
        << "\nB =\n" << tensor_B.host_view()
        << "\nC =\n" << tensor_C.host_view()
        << "\n\nReference =\n" << reference_D.host_view()
        << "\nComputed =\n" << tensor_D.host_view();
    }

    return passed;
  }

  /// Verifies the result
  ///
  /// Runs a batched host GemmComplex with n == 1 (i.e., a GEMV per batch)
  /// into reference_D, then compares against the device output.
  bool verify(
    cutlass::MatrixCoord problem_size,
    int32_t batch_count,
    int64_t batch_stride_A,
    int64_t batch_stride_B,
    int64_t batch_stride_C,
    int64_t batch_stride_D,
    ElementCompute alpha,
    ElementCompute beta) {

    //
    // Verify
    //

    cutlass::reference::host::GemmComplex<
        typename GemvStridedBatched::ElementA, typename GemvStridedBatched::LayoutA,
        typename GemvStridedBatched::ElementB, LayoutV,
        typename GemvStridedBatched::ElementC, LayoutV,
        ElementCompute, ElementAccumulator
    >(
      {problem_size.row(), 1, problem_size.column()},
      alpha,
      tensor_A.host_ref(),
      GemvStridedBatched::kTransformA,
      tensor_B.host_ref(),
      GemvStridedBatched::kTransformB,
      beta,
      tensor_C.host_ref(),
      reference_D.host_ref(),
      ElementAccumulator(0),
      batch_count,
      batch_stride_A,
      batch_stride_B,
      batch_stride_C,
      batch_stride_D
    );

    return compare_reference(problem_size, alpha, beta);
  }

  /// Runs one problem size
  ///
  /// Initializes data, checks implementability, launches the operator, and
  /// verifies. Returns true if the device result matches the host reference.
  bool run(
    cutlass::MatrixCoord problem_size,
    int32_t batch_count,
    int64_t batch_stride_A,
    int64_t batch_stride_B,
    int64_t batch_stride_C,
    int64_t batch_stride_D,
    ElementCompute alpha,
    ElementCompute beta) {

    this->initialize(problem_size, batch_count);

    //
    // Initialize the GEMV operator
    //

    typename GemvStridedBatched::Arguments arguments{
      problem_size,
      batch_count,
      {alpha, beta},
      tensor_A.device_ref(),
      tensor_B.device_data(),
      tensor_C.device_data(),
      tensor_D.device_data(),
      batch_stride_A,
      batch_stride_B,
      batch_stride_C,
      batch_stride_D
    };

    GemvStridedBatched gemm_op;

    cutlass::Status status = gemm_op.can_implement(arguments);
    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);

    size_t workspace_size = GemvStridedBatched::get_workspace_size(arguments);
    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

    status = gemm_op.initialize(arguments, workspace.get());
    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);

    //
    // Run the GEMV
    //

    status = gemm_op();
    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);

    //
    // Verify
    //

    bool passed = this->verify(
      problem_size,
      batch_count,
      batch_stride_A,
      batch_stride_B,
      batch_stride_C,
      batch_stride_D,
      alpha,
      beta);

    return passed;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Sweeps a fixed grid of batch counts, problem sizes, and epilogue scalars
/// through TestbedStridedBatchedGemv. Stops and returns false at the first
/// failing configuration; returns true when every point passes.
template <typename GemvStridedBatched>
bool TestAllGemv() {

  using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute;

  // Test-point grids.
  int const batch_counts[] = {1, 520, 1314};
  int const problem_m[] = {1, 5, 16};
  int const problem_k[] = {8, 128, 256};
  double const alphas[] = {1, 1.25};
  double const betas[] = {0, 1, 1.25};

  for (int batch : batch_counts) {
    for (int m : problem_m) {
      for (int k : problem_k) {
        for (double alpha : alphas) {
          for (double beta : betas) {

            // Batches are packed contiguously: A advances m*k elements per
            // batch, B advances k, and C/D advance m.
            TestbedStridedBatchedGemv<GemvStridedBatched> testbed;

            bool passed = testbed.run(
              {m, k},
              batch,
              m * k,
              k,
              m,
              m,
              ElementCompute(alpha),
              ElementCompute(beta));

            if (!passed) {
              return false;
            }
          }
        }
      }
    }
  }

  return true;
}
} // namespace gemm
} // namespace test
/////////////////////////////////////////////////////////////////////////////////////////////////
// Strided batched GEMV: half-precision inputs and outputs, f32 accumulation,
// SIMT path (minimum SM50).
TEST(SM50_Device_StridedBatchedGemv_f16n_f16_f16_simt_f32, Simple) {

  using ElementInput = cutlass::half_t;
  using ElementOutput = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementAccumulator = float;

  // 8 elements per vectorized memory access.
  int const kElementsPerAccess = 8;

  // Epilogue computes alpha * accum + beta * C with scalar granularity 1.
  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    1,
    ElementAccumulator,
    ElementAccumulator>;

  using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched<
    cutlass::gemm::kernel::GemvStridedBatched<
      ElementInput,       // Element A
      LayoutA,            // Layout A
      ElementInput,       // Element B
      ElementOutput,      // Element C
      ElementAccumulator, // Element accumulator
      kElementsPerAccess, // Element access granularity
      EpilogueOp          // Output operator
    >>;

  // Run the full sweep of batch counts, sizes, and scalars.
  EXPECT_TRUE(test::gemm::TestAllGemv<GemvStridedBatched>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Strided batched GEMV: single-precision inputs, outputs, and accumulation,
// SIMT path (minimum SM50).
TEST(SM50_Device_StridedBatchedGemv_f32n_f32_f32_simt_f32, Simple) {

  using ElementInput = float;
  using ElementOutput = float;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementAccumulator = float;

  // 4 elements per vectorized memory access.
  int const kElementsPerAccess = 4;

  // Epilogue computes alpha * accum + beta * C with scalar granularity 1.
  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    1,
    ElementAccumulator,
    ElementAccumulator>;

  using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched<
    cutlass::gemm::kernel::GemvStridedBatched<
      ElementInput,       // Element A
      LayoutA,            // Layout A
      ElementInput,       // Element B
      ElementOutput,      // Element C
      ElementAccumulator, // Element accumulator
      kElementsPerAccess, // Element access granularity
      EpilogueOp          // Output operator
    >>;

  // Run the full sweep of batch counts, sizes, and scalars.
  EXPECT_TRUE(test::gemm::TestAllGemv<GemvStridedBatched>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Strided batched GEMV: double-precision inputs, outputs, and accumulation,
// SIMT path (minimum SM50).
TEST(SM50_Device_StridedBatchedGemv_f64n_f64_f64_simt_f64, Simple) {

  using ElementInput = double;
  using ElementOutput = double;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementAccumulator = double;

  // 2 elements per vectorized memory access.
  int const kElementsPerAccess = 2;

  // Epilogue computes alpha * accum + beta * C with scalar granularity 1.
  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    1,
    ElementAccumulator,
    ElementAccumulator>;

  using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched<
    cutlass::gemm::kernel::GemvStridedBatched<
      ElementInput,       // Element A
      LayoutA,            // Layout A
      ElementInput,       // Element B
      ElementOutput,      // Element C
      ElementAccumulator, // Element accumulator
      kElementsPerAccess, // Element access granularity
      EpilogueOp          // Output operator
    >>;

  // Run the full sweep of batch counts, sizes, and scalars.
  EXPECT_TRUE(test::gemm::TestAllGemv<GemvStridedBatched>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -182,9 +182,9 @@ class GemmOperation:
ar = self.arch,
op = opcode_class_name,
ex = self.extended_name_3x(),
tbm = self.tile_description.threadblock_shape[0],
tbn = self.tile_description.threadblock_shape[1],
tbk = self.tile_description.threadblock_shape[2],
tbm = self.tile_description.tile_shape[0],
tbn = self.tile_description.tile_shape[1],
tbk = self.tile_description.tile_shape[2],
cm = self.tile_description.cluster_shape[0],
cn = self.tile_description.cluster_shape[1],
ck = self.tile_description.cluster_shape[2],
@ -640,7 +640,7 @@ class EmitGemmUniversal3xInstance:
using ${operation_name}_epilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
${arch}, ${opcode_class},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::epilogue::collective::EpilogueTileAuto,
${element_accumulator}, ${element_epilogue},
@ -655,7 +655,7 @@ using ${operation_name}_mainloop =
${element_a}, ${layout_a}, ${align_a},
${element_b}, ${layout_b}, ${align_b},
${element_accumulator},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::gemm::collective::StageCountAutoCarveout<
sizeof(typename ${operation_name}_epilogue::SharedStorage)>,
@ -686,14 +686,14 @@ ${compile_guard_end}
#
def emit(self, operation):
threadblock_shape = operation.tile_description.threadblock_shape
tile_shape = operation.tile_description.tile_shape
warp_count = operation.tile_description.warp_count
# stage count set to zero indicates builder automatic stage selection
if operation.tile_description.stages > 0:
stage_count_string = f"cutlass::gemm::collective::StageCount<{str(operation.tile_description.stages)}>"
else:
stage_count_string = "cutlass::gemm::collective::StageCountAuto"
warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
warp_shape = [tile_shape[idx] // warp_count[idx] for idx in range(3)]
instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \
(operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout)
@ -727,9 +727,9 @@ ${compile_guard_end}
'element_accumulator': DataTypeTag[operation.accumulator_type()],
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
'arch': "cutlass::arch::Sm%d" % operation.arch,
'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
'tile_shape_m': str(operation.tile_description.tile_shape[0]),
'tile_shape_n': str(operation.tile_description.tile_shape[1]),
'tile_shape_k': str(operation.tile_description.tile_shape[2]),
'cluster_m': str(operation.tile_description.cluster_shape[0]),
'cluster_n': str(operation.tile_description.cluster_shape[1]),
'cluster_k': str(operation.tile_description.cluster_shape[2]),

View File

@ -91,22 +91,21 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
# Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts
def CreateGemmUniversal3xOperator(
manifest, layouts, tile_descriptions, data_type,
manifest, layouts, tile_descriptions, data_types,
schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]],
complex_transforms=None,
epilogue_functor=EpilogueFunctor.LinearCombination,
swizzling_functor=SwizzlingFunctor.Identity1):
if type(data_types) is dict:
data_types = [data_types]
for s in schedules:
assert(len(s) == 2)
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none), ]
element_a = data_type["a_type"]
element_b = data_type["b_type"]
element_c = data_type["c_type"]
element_d = data_type["d_type"]
element_acc = data_type["acc_type"]
element_epilogue = data_type.get("epi_type", element_acc)
operations = []
# by default, only generate the largest tile and largest alignment
@ -115,23 +114,25 @@ def CreateGemmUniversal3xOperator(
for layout in layouts:
for tile_description in tile_descriptions:
for complex_transform in complex_transforms:
for kernel_schedule, epilogue_schedule in schedules:
A = TensorDescription(
element_a, layout[0][0], layout[0][1], complex_transform[0])
B = TensorDescription(
element_b, layout[1][0], layout[1][1], complex_transform[1])
for data_type in data_types:
for complex_transform in complex_transforms:
for kernel_schedule, epilogue_schedule in schedules:
A = TensorDescription(
data_type["a_type"], layout[0][0], layout[0][1], complex_transform[0])
B = TensorDescription(
data_type["b_type"], layout[1][0], layout[1][1], complex_transform[1])
C = TensorDescription(element_c, layout[2][0], layout[2][1])
D = TensorDescription(element_d, layout[2][0], layout[2][1])
C = TensorDescription(data_type["c_type"], layout[2][0], layout[2][1])
D = TensorDescription(data_type["d_type"], layout[2][0], layout[2][1])
operation = GemmOperation(
GemmKind.Universal3x, tile_description.minimum_compute_capability,
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, D,
kernel_schedule, epilogue_schedule)
element_compute = data_type.get("epi_type", data_type["acc_type"])
operation = GemmOperation(
GemmKind.Universal3x, tile_description.minimum_compute_capability,
tile_description, A, B, C, element_compute, epilogue_functor, swizzling_functor, D,
kernel_schedule, epilogue_schedule)
manifest.append(operation)
operations.append(operation)
manifest.append(operation)
operations.append(operation)
return operations
@ -4118,21 +4119,19 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
layout[2][1] = 8
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
kernel_schedules = [
KernelScheduleType.ScheduleAuto,
KernelScheduleType.TmaWarpSpecializedCooperative,
KernelScheduleType.TmaWarpSpecializedPingpong,
KernelScheduleType.TmaWarpSpecialized
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
]
else:
kernel_schedules = [
KernelScheduleType.ScheduleAuto,
KernelScheduleType.TmaWarpSpecialized
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
# TmaWarpSpecializedCooperative and TmaWarpSpecializedPingpong require CUDA version >= 12.1 for optimal performance.
]
schedules = [[s, EpilogueScheduleType.ScheduleAuto] for s in kernel_schedules]
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)
# persistent kernels with TMA epilogues
@ -4140,6 +4139,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# Emit instance without C allocation+load
data_type["c_type"] = DataType.void
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# for mixed precision kernels, also generate kernels that write output matrix in the A/B format
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
@ -4166,6 +4170,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# Emit instance without C allocation+load
data_type_mixed["c_type"] = DataType.void
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
#
def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
@ -4212,19 +4221,32 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
]
# TMA kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides),
# because they use NN kernels underneath and transposing its epilogue will get the correct output
schedules_transposed_epilogue = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.EpilogueTransposed]
]
# TMA kernels with TN or NN layout
layouts_tf32_tn_nn = [layouts_tf32[0], layouts_tf32[2]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32, schedules)
# TMA kernels with NT layout, only support 64x128x32 tile for now.
layouts_tf32_nt = [layouts_tf32[3]]
tile_64x128x32_descriptions = [tile_descriptions[0], tile_descriptions[1], tile_descriptions[2]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32)
tile_128x128x32_descriptions = [tile_descriptions[3], tile_descriptions[4], tile_descriptions[5]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32, schedules)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_tf32, [schedules[1]])
# TMA kernels with TT layout use EpilogueTransposed, because swapping NN kernel and transposed its epilogue will get the kernel
layouts_tf32_tt = [layouts_tf32[1]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32,
[[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32, schedules_transposed_epilogue)
# F32 kernel share same settings with tf32 I/O kernels excluding data type
data_type_f32 = {
@ -4236,10 +4258,10 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
"epi_type" : DataType.f32
}
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32,
[[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32, schedules)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32, schedules)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_f32, [schedules[1]])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32, schedules_transposed_epilogue)
#
def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
@ -4910,8 +4932,8 @@ def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version):
#
def GenerateSM90(manifest, cuda_version):
GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_1684(manifest, cuda_version)
GenerateSM90_TensorOp_1684_complex(manifest, cuda_version)
GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version)

View File

@ -40,6 +40,7 @@ GeneratorTargetNames = {
#
class DataType(enum.Enum):
void = enum_auto() # primarily used to disable C tensor for epilogues
b1 = enum_auto()
u4 = enum_auto()
u8 = enum_auto()
@ -89,6 +90,7 @@ ShortDataTypeNames = {
#
DataTypeNames = {
DataType.void: "void",
DataType.b1: "b1",
DataType.u4: "u4",
DataType.u8: "u8",
@ -121,10 +123,11 @@ DataTypeNames = {
DataType.cs8: "cs8",
DataType.cs16: "cs16",
DataType.cs32: "cs32",
DataType.cs64: "cs64",
DataType.cs64: "cs64",
}
DataTypeTag = {
DataType.void: "void",
DataType.b1: "cutlass::uint1b_t",
DataType.u4: "cutlass::uint4b_t",
DataType.u8: "uint8_t",
@ -161,6 +164,7 @@ DataTypeTag = {
}
DataTypeSize = {
DataType.void: 0,
DataType.b1: 1,
DataType.u4: 4,
DataType.u8: 8,
@ -765,6 +769,7 @@ class TileDescription:
def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute, cluster_shape = [1,1,1]):
self.threadblock_shape = threadblock_shape
self.tile_shape = threadblock_shape
self.stages = stages
self.warp_count = warp_count
self.math_instruction = math_instruction

View File

@ -240,7 +240,9 @@ class Manifest:
self.kernel_filter_list = []
else:
self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
_LOGGER.info("Using {filter_count} kernel filters from {filter_file}".format(
filter_count = len(self.kernel_filter_list),
filter_file = args.kernel_filter_file))
self.operation_count = 0
self.operations_by_name = {}
@ -311,19 +313,29 @@ class Manifest:
# compare against the include list
for name_substr in self.kernel_names:
if self._filter_string_matches(name_substr, name):
_LOGGER.debug("Kernel {kernel} included due to filter string '{filt}'.".format(
kernel = operation.procedural_name(),
filt = name_substr))
enabled = True
break
# compare against the exclude list
for name_substr in self.ignore_kernel_names:
if self._filter_string_matches(name_substr, name):
_LOGGER.debug("Kernel {kernel} ignored due to filter string '{filt}'.".format(
kernel = operation.procedural_name(),
filt = name_substr))
enabled = False
break
if len(self.kernel_filter_list) > 0:
enabled = False
if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
enabled = True
_LOGGER.debug("Kernel {kernel} matched via kernel filter file.".format(kernel = operation.procedural_name()))
enabled = True
else:
_LOGGER.debug("Kernel {kernel} culled due to no match in kernel filter file.".format(kernel = operation.procedural_name()))
enabled = False
# todo: filter based on compute data type
return enabled
@ -389,6 +401,8 @@ class Manifest:
for operation_kind, configurations in self.operations.items():
with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
for configuration_name, operations in configurations.items():
_LOGGER.info("Emitting {config} with {num_ops} operations.".format(
config = configuration_name, num_ops = len(operations)))
operation_kind_emitter.emit(configuration_name, operations)
source_files += operation_kind_emitter.source_files

View File

@ -64,6 +64,10 @@ namespace library {
template <typename T> struct NumericTypeMap;
template <> struct NumericTypeMap<void> {
static NumericTypeID const kId = NumericTypeID::kVoid;
};
template <> struct NumericTypeMap<cutlass::uint1b_t> {
static NumericTypeID const kId = NumericTypeID::kB1;
};

View File

@ -107,15 +107,17 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --provid
cutlass_add_executable_tests(
test_profiler cutlass_profiler
DEPENDEES test_all
TEST_COMMAND_OPTIONS
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM
TEST_COMMAND_OPTIONS
GEMM
CONV2D
CONV3D
SPGEMM
RANK_K
RANK_2K
TRMM
SYMM
TEST_COMMAND_OPTIONS_PREFIX
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_
DISABLE_EXECUTABLE_INSTALL_RULE
)

View File

@ -124,7 +124,7 @@ int CutlassProfiler::operator()() {
options_.execution_mode == ExecutionMode::kTrace) {
// Profiles all operations
profile_();
return profile_();
}
else if (options_.execution_mode == ExecutionMode::kEnumerate) {
// Enumerates all operations
@ -157,7 +157,7 @@ int CutlassProfiler::profile_() {
if (result) {
return result;
}
}
}
}

View File

@ -462,6 +462,13 @@ size_t DeviceAllocation::bytes() const {
/// Copies from an equivalent-sized tensor in device memory
void DeviceAllocation::copy_from_device(void const *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
@ -470,22 +477,43 @@ void DeviceAllocation::copy_from_device(void const *ptr) {
/// Copies from an equivalent-sized tensor in device memory
void DeviceAllocation::copy_from_host(void const *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
throw std::runtime_error("Failed host-to-device copy");
}
}
/// Copies from an equivalent-sized tensor in device memory
void DeviceAllocation::copy_to_host(void *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
throw std::runtime_error("Failed device-to-host copy");
}
}
void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -690,7 +718,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
}
void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -699,7 +734,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
switch (type_) {
case library::NumericTypeID::kFE4M3:
cutlass::reference::host::BlockFillRandom<cutlass::float_e4m3_t>(
reinterpret_cast<cutlass::float_e4m3_t *>(pointer_),
reinterpret_cast<cutlass::float_e4m3_t *>(host_data.data()),
capacity_,
seed,
dist
@ -707,7 +742,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
break;
case library::NumericTypeID::kFE5M2:
cutlass::reference::host::BlockFillRandom<cutlass::float_e5m2_t>(
reinterpret_cast<cutlass::float_e5m2_t *>(pointer_),
reinterpret_cast<cutlass::float_e5m2_t *>(host_data.data()),
capacity_,
seed,
dist
@ -904,7 +939,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
}
void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSizeInBits) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -934,7 +976,14 @@ void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSiz
}
void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeInBits) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}

View File

@ -68,6 +68,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options):
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
{ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D output"},
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
{ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"},
@ -206,6 +207,10 @@ Status GemmOperationProfiler::GemmProblem::parse(
return Status::kErrorInvalidProblem;
}
if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) {
return Status::kErrorInvalidProblem;
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
@ -307,6 +312,9 @@ void GemmOperationProfiler::GemmProblem::initialize_result(
set_argument(result, "C", problem_space,
std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));
set_argument(result, "D", problem_space,
std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout));
set_argument(result, "m", problem_space, m);
set_argument(result, "n", problem_space, n);
set_argument(result, "k", problem_space, k);
@ -537,8 +545,6 @@ Status GemmOperationProfiler::initialize_workspace(
problem_.batch_count * gemm_workspace_.problem_count
);
gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
// NOTE: the leading non-batch strides are duplicated here for 3.0 API kernels
gemm_workspace_.arguments.problem_size = {int(problem_.m), int(problem_.n), int(problem_.k)};
gemm_workspace_.arguments.batch_count = problem_.batch_count;

View File

@ -270,17 +270,17 @@ int OperationProfiler::profile_all(
ProblemSpace::Iterator problem_it = problem_space.begin();
ProblemSpace::Iterator problem_end = problem_space.end();
bool continue_profiling = true, internal_error = false;
bool continue_profiling = true;
int retval = 0;
// For each problem in problem space
for (; continue_profiling && problem_it != problem_end; ++problem_it) {
ProblemSpace::Problem problem = problem_it.at();
report.next_problem();
// For each operation in manifest
for (auto const & operation_ptr : manifest) {
int matched_operation_count = 0;
for (auto const& operation_ptr : manifest) {
library::Operation const *operation = operation_ptr.get();
@ -292,8 +292,8 @@ int OperationProfiler::profile_all(
// Execute compatible cutlass operations if they satisfy the current device's compute capability
if (operation->description().kind == kind_ &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability() >= min_cc &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability() >= min_cc &&
options.device.compute_capability() <= max_cc) {
std::string operation_name(operation->description().name);
@ -320,7 +320,10 @@ int OperationProfiler::profile_all(
if (!filtered_by_name || !satisfies(operation->description(), problem_space, problem)) {
continue;
}
// we have found a kernel match, so increment the counter for match kernels
++matched_operation_count;
// A. Initialize configuration
Status status = this->initialize_configuration(
options,
@ -374,7 +377,6 @@ int OperationProfiler::profile_all(
//
// B. Verify CUTLASS
if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
continue_profiling = this->verify_cutlass(
@ -426,10 +428,18 @@ int OperationProfiler::profile_all(
if (!continue_profiling) {
break;
}
}
}
// If we did not find any kernels that match our filters and error_on_no_match was set, report an error
if (options.profiling.error_on_no_match && matched_operation_count <= 0) {
#if !NDEBUG
std::cout << "Error: No matching kernels found with kernel selection filters [--error_on_no_match]" << std::endl;
#endif
retval = 1;
}
}
return internal_error ? 1 : 0;
return retval;
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -706,10 +706,12 @@ Options::Options(cutlass::CommandLine const &cmdline):
}
else if (cmdline.check_cmd_line_flag("kernels")) {
cmdline.get_cmd_line_arguments("kernels", operation_names);
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
}
if (cmdline.check_cmd_line_flag("ignore-kernels")) {
cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names);
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
}
// Prevent launches on the device for anything other than CUTLASS operation

View File

@ -196,6 +196,9 @@ public:
/// If true, profiling is actually conducted.
bool enabled;
/// If true, profiling returns an error code if no kernels are found to match the filters.
bool error_on_no_match = false;
/// List of providers of each functionality to be profiled
ProviderVector providers;