diff --git a/CHANGELOG.md b/CHANGELOG.md index 63e3e80e..fc269c8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,7 +32,9 @@ * CUTLASS library and profiler integration for block scaled data types for kernel emission, profiling, and verification. - Support for preferred and fallback cluster shapes via profiler command line arguments parsing to set dynamic cluster shapes. - Support for dynamic datatypes by parsing profiler via profiler command line arguments parsing to set dynamic datatype setting in TCGen05 MMA instruction descriptors. + - Support for mixed input GEMM kernels on Hopper in the profiler. * New CUTLASS profiler flag `use-cuda-graphs` to reduce overheads when benchmarking launch-bound kernels. +* A new 3.x version of grouped GEMM added to the CUTLASS library that generates kernels for Hopper and Blackwell. Grouped GEMM support is now enabled in the CUTLASS profiler (`./cutlass_profiler --operation=GroupedGemm --help` for details). * Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM100 architecture: - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](./examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API. - GEMM with [opt-in collective builder schedules showcasing available recipes](./examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell. @@ -45,12 +47,16 @@ - Grouped GEMM for [vanilla FP8 data inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu). - Convolution kernels for [fprop](./examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](./examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](./examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu). - [Fused multi-head attention fprop kernel](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32,64, and 128. + - A new BF16x9 GEMM [kernel](./examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations. +* Set of examples that demonstrate the usage of the 3.x API for targeting Hopper architecture: + - A set of new [Hopper grouped GEMM kernels](./examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes. + - A new [Hopper FP8 GEMM with groupwise scaling](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu). * Documentation updates: - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/quickstart.md#instantiating-a-blackwell-gemm-kernel). - Detailed [Blackwell block-scaled GEMM functionality documentation](./media/docs/blackwell_functionality.md) - A new [functionality documentation](./media/docs/functionality.md) specifically for 3.x API comprehensively documenting all supported kernel types, data types, kernel features, minimum CUDA tookit support etc for 3.x supported architectures. - Updates to [compatibility](./README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures, and [Target Architecture](./README.md#Target-Architecture). - - Support grouped GEMM in the CUTLASS profiler (`./cutlass_profiler --operation=GroupedGemm --help` for details).
+ - Updates to [profiler documentation](./media/docs/profiler.md) for testing mixed input GEMM kernels on Hopper. ## [3.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.7.0) (2025-01-11) - [Hopper blockwise scaling FP8 GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) uses 2D scaling tensor, assigning one value per threadblock. This allows a finer-grained scaling to be applied for each output tile per gemm-k iteration. The operands and scaling tensors are loaded from global memory to shared memory using TMA and cp_async, respectively. The scaling is applied inside the mainloop. Details with figures are [here](https://github.com/NVIDIA/cutlass/pull/1932#issue-2645398439). diff --git a/CMakeLists.txt b/CMakeLists.txt index b9de4f96..0b68b435 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,14 +150,14 @@ set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUT set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests") set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests") set(CUTLASS_USE_SYSTEM_GOOGLETEST OFF CACHE BOOL "Use system/external installation of GTest") -set(CUTLASS_USE_PACKED_TUPLE ON CACHE BOOL "If ON, make cute::tuple be new standard-layout tuple type; if OFF, use the original cute::tuple implementation that is _not_ standard-layout.") -if (CUTLASS_USE_PACKED_TUPLE) - list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTE_USE_PACKED_TUPLE=1) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUTLASS_USE_PACKED_TUPLE=1") - message(STATUS "Make cute::tuple be the new standard-layout tuple type") -elseif() - message(STATUS "Use the original cute::tuple implementation that is _not_ standard-layout") + +if (CUTLASS_ENABLE_TESTS AND CUTLASS_ENABLE_PROFILER) + set(CUTLASS_ENABLE_PROFILER_UNIT_TESTS_INIT ON) +else() + set(CUTLASS_ENABLE_PROFILER_UNIT_TESTS_INIT OFF) endif() +set(CUTLASS_ENABLE_PROFILER_UNIT_TESTS ${CUTLASS_ENABLE_PROFILER_UNIT_TESTS_INIT} CACHE BOOL "Enable CUTLASS Profiler-based Unit Tests") +set(CUTLASS_ENABLE_SELF_CONTAINED_INCLUDES_CHECK ON CACHE BOOL "Enable CUTLASS check for self-contained header includes") ################################################################################ @@ -406,7 +406,7 @@ endif() # Warnings-as-error exceptions and warning suppressions for Clang builds if (CUTLASS_CLANG_HOST_COMPILE) - + set(FLAGS_TO_ADD "-Wno-error=implicit-int-conversion" "-Wno-error=pass-failed" @@ -414,13 +414,13 @@ if (CUTLASS_CLANG_HOST_COMPILE) "-Wno-sign-conversion" "-Wno-unused-parameter" ) - + foreach(FLAG ${FLAGS_TO_ADD}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}") list(APPEND CUTLASS_CUDA_NVCC_FLAGS "${FLAG}") list(APPEND CUTLASS_CUDA_CLANG_FLAGS "${FLAG}") endforeach() - + endif() if (NOT MSVC AND CUTLASS_NVCC_KEEP) @@ -486,7 +486,7 @@ if (CUTLASS_CLANG_DEVICE_COMPILE) link_libraries(nvidia::cudart) link_libraries(nvidia::cuda_driver) - + endif() #Report CUDA build flags @@ -561,7 +561,7 @@ function(cutlass_apply_cuda_gencode_flags TARGET) list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-real) endif() if(CUTLASS_NVCC_EMBED_PTX AND NOT CUTLASS_CLANG_DEVICE_COMPILE) - # If we're using clang for device compilation, the ptx is inserted + # If we're using clang for device compilation, the ptx is inserted # via another command line option and the `-virtual` flags will cause an error. 
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-virtual) endif() @@ -922,7 +922,7 @@ function(cutlass_add_executable_tests NAME TARGET) if (NOT __DO_NOT_LOWERCASE_TEST_NAME) string(TOLOWER "${TESTCASE_NAME}" TESTCASE_NAME) endif() - + # The following rigmarole is needed to deal with spaces and possible quotes in # command line arguments. The options are passed "by reference" as the actual # variable names holding the real options. We then expand these in a way that @@ -1007,46 +1007,51 @@ function(cutlass_generate_profiler_tests NAME) endif() file(STRINGS ${CUTLASS_PROFILER_REGRESSION_LIST_FILE} TEST_LIST) - foreach(TEST IN LISTS TEST_LIST) - + set(TEMP_TEST ${TEST}) if ("${TEST}" MATCHES " *cutlass_profiler.*") - # Generate a flattened name for the test from the test command line. - string(REPLACE "," ";" TEST_NAME_LIST ${TEST}) - list(GET TEST_NAME_LIST 0 TEST) - string(REGEX MATCHALL "[a-zA-Z0-9_=]+" TEST_NAME "${TEST}") - list(FILTER TEST_NAME EXCLUDE REGEX "cutlass_profiler|mode=trace|providers=cutlass") - list(JOIN TEST_NAME "_" TEST_NAME) - string(REGEX REPLACE "_verification_required=(true|false)" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "_verification_providers=device" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "batch_count=" "batch" TEST_NAME "${TEST_NAME}") - string(REPLACE "cluster_m=" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "_cluster_n=" "x" TEST_NAME "${TEST_NAME}") - string(REGEX REPLACE "_cluster_k=[0-9]+" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "cluster_m_fallback=" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "_cluster_n_fallback=" "x" TEST_NAME "${TEST_NAME}") - string(REGEX REPLACE "_cluster_k_fallback=[0-9]+" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "runtime_input_datatype_a=" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "runtime_input_datatype_b=" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "=" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "_error_on_no_match" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "_error_if_nothing_is_profiled" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "kernels" "" TEST_NAME "${TEST_NAME}") - string(REPLACE "operation" "" TEST_NAME "${TEST_NAME}") + # Generate a flattened name for the test from the test command line. 
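# Note: the string operations below flatten each regression-list entry into a unique,
# filesystem-friendly test name by dropping the profiler invocation and bookkeeping flags
# (verification settings, cluster fallback shapes, runtime datatypes, warmup/profiling
# iteration counts, etc.) and joining the remaining tokens with underscores.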
+ string(REPLACE "," ";" TEST_NAME_LIST ${TEMP_TEST}) + string(REGEX REPLACE "\\*" "_" TEST_NAME "${TEMP_TEST}") + string(REGEX REPLACE "\\\"\\{\\\"\\\"input_params.*\\{.*\\}\\}\\\"" "" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "\\\"\\{\\\"\\\"input_params.*\\{.*\\}\\}\\\"" "" TEST "${TEST}") + string(REGEX REPLACE "," ";" TEST "${TEST}") + string(REGEX MATCHALL "[a-zA-Z0-9_=]+" TEST_NAME "${TEST_NAME}") + list(FILTER TEST_NAME EXCLUDE REGEX "cutlass_profiler|mode=trace|providers=cutlass") + list(JOIN TEST_NAME "_" TEST_NAME) + string(REGEX REPLACE "_verification_required=(true|false)" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "_verification_providers=device" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "batch_count=" "batch" TEST_NAME "${TEST_NAME}") + string(REPLACE "cluster_m=" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "_cluster_n=" "x" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "_cluster_k=[0-9]+" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "cluster_m_fallback=" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "_cluster_n_fallback=" "x" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "_cluster_k_fallback=[0-9]+" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "runtime_input_datatype_a=" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "runtime_input_datatype_b=" "" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "verification_enabled=(true|false)" "" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "warmup_iterations=[0-9]+" "" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "profiling_iterations=[0-9]+" "" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "sleep_duration=[0-9]+" "" TEST_NAME "${TEST_NAME}") + string(REGEX REPLACE "profiling_enabled=(true|false)" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "=" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "_error_on_no_match" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "_error_if_nothing_is_profiled" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "kernels" "" TEST_NAME "${TEST_NAME}") + string(REPLACE "operation" "" TEST_NAME "${TEST_NAME}") - if (__DO_NOT_LOWERCASE_TEST_NAME) - string(TEST_NAME_LOWER "${TEST_NAME}") - else() - string(TOLOWER "${TEST_NAME}" TEST_NAME_LOWER) - endif() + if (NOT __DO_NOT_LOWERCASE_TEST_NAME) + string(TOLOWER "${TEST_NAME}" TEST_NAME) + endif() - # Munge the test command - string(REPLACE "cutlass_profiler" "" TEST "${TEST}") - set(TEST "${TEST}" ${__CUTLASS_PROFILER_EXTRA_OPTIONS} "--junit-output=${TEST_NAME_LOWER}") - set(TEST_COMMAND_${TEST_NAME_LOWER} "${TEST}") - list(APPEND TEST_COMMAND_VARS ${TEST_NAME_LOWER}) + # Munge the test command + string(REPLACE "cutlass_profiler" "" TEST "${TEST}") + set(TEST "${TEST}" ${__CUTLASS_PROFILER_EXTRA_OPTIONS} "--junit-output=${TEST_NAME}") + set(TEST_COMMAND_${TEST_NAME} "${TEST}") + list(APPEND TEST_COMMAND_VARS ${TEST_NAME}) endif() endforeach() @@ -1084,6 +1089,14 @@ if (CUTLASS_ENABLE_TESTS) if (CUTLASS_ENABLE_GTEST_UNIT_TESTS) add_dependencies(test_all test_unit) endif() + if (CUTLASS_ENABLE_PROFILER_UNIT_TESTS AND CUTLASS_BUILD_FOR_PROFILER_REGRESSIONS) + # Generate profiler based unit test + cutlass_generate_profiler_tests( + tup + DEPENDEES test_unit + ) + endif() + endif() if (CUTLASS_INSTALL_TESTS) diff --git a/ACTIVE_DEVELOPERS.md b/CONTRIBUTORS.md similarity index 96% rename from ACTIVE_DEVELOPERS.md rename to CONTRIBUTORS.md index 6ae47b43..1ef06a36 100644 --- a/ACTIVE_DEVELOPERS.md +++ b/CONTRIBUTORS.md @@ -27,7 +27,7 @@ Siyu Liu
Richard Cai
Vikas Gupta
Ethan Yan
-Vijay Thakkar (CUTLASS 3.x founding member)
+Vijay Thakkar (CUTLASS 3.x and CuTe founding member)
Cris Cecka (CuTe and CUTLASS 3.x founding member)
Lawrence Ryan
Qun Song
diff --git a/README.md b/README.md index f9f23c08..ada18b39 100644 --- a/README.md +++ b/README.md @@ -43,23 +43,23 @@ architecture. CUTLASS 3.8 is the first release that supports the NVIDIA Blackwell SM100 architecture. For a background on Blackwell's new features, please consult the PTX documentation for CUDA 12.8. -* Support for new CuTe building blocks specifically for Blackwell architecture: +* Support for new CuTe building blocks specifically for Blackwell SM100 architecture: - [5th generation Blackwell Tensor Core instructions (TCGen05)](./include/cute/atom/mma_traits_sm100.hpp) via CuTe MMA atoms. - Extensions to [Tensor Memory Accelerator](./include/cute/atom/copy_traits_sm100_tma.hpp) via CuTe Copy atoms. - - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](./include/cute/pointer.hpp#L290) across CuTe as a first class data locale. + - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](./include/cute/pointer.hpp) across CuTe as a first class data locale. - Exposure of [`tmem->rmem`, `rmem->tmem` and `smem->tmem data movement instructions`](./include/cute/atom/copy_traits_sm100.hpp) as copy atoms in CuTe. - [`make_tmem_copy()`](./include/cute/atom/copy_traits_sm100.hpp) utility method to ease creation of tiled copies for tmem copy atoms. - Support for [new variants of LDSM on Blackwell](./include/cute/atom/copy_traits_sm100.hpp) via CuTe Copy atoms. -* Support for new CUTLASS building blocks specifically for Blackwell architecture: +* Support for new CUTLASS building blocks specifically for Blackwell SM100 architecture: - Various narrow precision [FP4, FP6, and FP8](./include/cutlass/exmy_base.h) formats as well as their [block-scaled variants NVFP4, MXFP4, MXFP6, and MXFP8](./include/cutlass/float_subbyte.h) - [Pipelines that implement Blackwell specific synchronization](./include/cutlass/pipeline/sm100_pipeline.hpp). - [Cluster launch control API supporting preferred and fallback cluster shapes](./include/cutlass/cluster_launch.hpp). - Data types including NVFP4, MXFP4, MXFP6, and MXFP8 and all their supported element and scale factor types. - Tile schedulers using [Blackwell's Cluster Launch Control (CLC) feature](./media/docs/blackwell_cluster_launch_control.md) to implement dynamic persistence scheduling for [GEMMs](./include/cutlass/gemm/kernel/sm100_tile_scheduler.hpp), and [stream-K](./include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp). - Extensions to testbeds and reference check code for unit tests and CUTLASS profiler. -* Full support for Blackwell kernels in CUTLASS 3.x API: +* Full support for Blackwell SM100 kernels in CUTLASS 3.x API: - [Blackwell specific kernel layers](./include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp) that - + Implement a new warp-specialization recipe tuned specifically for Blackwell. + + Implement a new warp-specialization recipe tuned specifically for Blackwell SM100 architecture. + Leverage all the new features such as CLC based tile scheduling, preferred cluster, and TMEM based double buffering of accumulators. + Support stream-K load balancing for all kernel types everywhere via composable scheduler support. - Blackwell collective mainloops that target the TCGen05 MMA instructions (both SS and TS) for @@ -73,7 +73,10 @@ For a background on Blackwell's new features, please consult the PTX documentati * CUTLASS library and profiler integration for block scaled data types for kernel emission, profiling, and verification. 
- Support for preferred and fallback cluster shapes via profiler command line arguments parsing to set dynamic cluster shapes. - Support for dynamic datatypes by parsing profiler via profiler command line arguments parsing to set dynamic datatype setting in TCGen05 MMA instruction descriptors. -* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell + - Support for mixed input GEMM kernels on Hopper in the profiler. +* New CUTLASS profiler flag `use-cuda-graphs` to reduce overheads when benchmarking launch-bound kernels. +* A new 3.x version of grouped GEMM added to the CUTLASS library that generates kernels for Hopper and Blackwell. Grouped GEMM support is now enabled in the CUTLASS profiler (`./cutlass_profiler --operation=GroupedGemm --help` for details). +* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM100 architecture: - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](./examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API. - GEMM with [opt-in collective builder schedules showcasing available recipes](./examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell. - Block scaled data type GEMMs targeting Blackwell's native block scaled Tensor Cores: @@ -85,6 +88,10 @@ For a background on Blackwell's new features, please consult the PTX documentati - Grouped GEMM for [vanilla FP8 data inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu). - Convolution kernels for [fprop](./examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](./examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](./examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu). - [Fused multi-head attention fprop kernel](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32,64, and 128. + - A new BF16x9 GEMM [kernel](./examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations. +* Set of examples that demonstrate the usage of the 3.x API for targeting Hopper architecture: + - A set of new [Hopper grouped GEMM kernels](./examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes. + - A new [Hopper FP8 GEMM with groupwise scaling](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu). * Documentation updates: - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/quickstart.md#instantiating-a-blackwell-gemm-kernel).
- Detailed [Blackwell block-scaled GEMM functionality documentation](./media/docs/blackwell_functionality.md) diff --git a/customConfigs.cmake b/customConfigs.cmake index c86e15be..e39212db 100644 --- a/customConfigs.cmake +++ b/customConfigs.cmake @@ -47,7 +47,7 @@ function(cutlass_generate_kernel_filter_and_testlists_files) COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CUTLASS_LIBRARY_PACKAGE_DIR} ${Python3_EXECUTABLE} ${CUTLASS_SOURCE_DIR}/python/cutlass_library/generator.py --generator-target=${__TEST_SET_NAME} - --cuda-version=${CUTLASS_GENERATOR_CUDA_COMPILER_VERSION} + --cuda-version=${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} --architectures=${CUTLASS_NVCC_ARCHS} --kernels=\* --disable-cutlass-package-imports diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu index 0c407d34..74aa3614 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu @@ -45,7 +45,7 @@ CTA rasterization direction and swizzle pattern impact cross-CTA locality of accesses. By tuning we can improve performance. Examples: - $ ./examples/64_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling/64_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling \ + $ ./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling \ --m=2816 --n=3072 --k=16384 \ --save_aux=false --save_amax=false \ --device_scale=false --raster=h --swizzle=2 @@ -119,22 +119,56 @@ using ElementBias = float; using ElementAccumulator = float; // Element type for internal accumulation using ElementBlockScale = float; // Element type for blockscaling during accumulation using ElementCompute = float; // Element type for epilogue computation -using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature -using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag -using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size -using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster -constexpr int ScaleMsPerTile = 2; -constexpr int ScaleGranularityM = size<0>(TileShape{}) / ScaleMsPerTile; +using TileShape_ = Shape<_128,_128,_128>; // This one is just to make the compiler happy with verify()... 
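// A minimal worked example of the scale-granularity arithmetic, assuming TileShape_ = Shape<_128,_128,_128>:
//   GroupScaleConfig<  1,   1>  ->  ScaleMsPerTile = 128, ScaleNsPerTile = 128  (1Dx1D scaling)
//   GroupScaleConfig<  1, 128>  ->  ScaleMsPerTile = 128, ScaleNsPerTile =   1  (1Dx2D scaling)
//   GroupScaleConfig<128,   1>  ->  ScaleMsPerTile =   1, ScaleNsPerTile = 128  (2Dx1D scaling)
//   GroupScaleConfig<128, 128>  ->  ScaleMsPerTile =   1, ScaleNsPerTile =   1  (2Dx2D scaling)
// since ScaleMsPerTile = size<0>(TileShape) / ScaleGranularityM and ScaleNsPerTile = size<1>(TileShape) / ScaleGranularityN.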
-using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum; -using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; +// ScaleGranularity{M,N}: number of {rows in A}/{columns in B} that share the same scaling factor +// Given TileShape = Shape<_128,_128,_128>: +// ScaleGranularityM == 128 and ScaleGranularityN == 128 --> 2Dx2D (the shape of the scaling factor) +// ScaleGranularityM == 1 and ScaleGranularityN == 128 --> 1Dx2D scaling +// ScaleGranularityM == 128 and ScaleGranularityN == 1 --> 2Dx1D scaling +// ScaleGranularityM == 1 and ScaleGranularityN == 1 --> 1Dx1D scaling +template +struct GroupScaleConfig { + using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature + using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag + using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size + using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster -using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; -using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltActAmaxAux< + static constexpr int ScaleGranularityM = ScaleGranularityM_; + static constexpr int ScaleGranularityN = ScaleGranularityN_; + static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; + static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; + + static_assert(size<0>(TileShape{}) == ScaleGranularityM * ScaleMsPerTile, + "FP8 scaling granularity must evenly divide tile shape along M."); + static_assert(size<1>(TileShape{}) == ScaleGranularityN * ScaleNsPerTile, + "FP8 scaling granularity must evenly divide tile shape along N."); + + using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum; + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; + using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltActAmaxAux< LayoutAux, cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementC>; +}; -using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< +using GroupScale1D1DConfig = GroupScaleConfig< 1, 1>; +using GroupScale1D2DConfig = GroupScaleConfig< 1, size<1>(TileShape_{})>; +using GroupScale2D1DConfig = GroupScaleConfig(TileShape_{}), 1>; +using GroupScale2D2DConfig = GroupScaleConfig(TileShape_{}), size<1>(TileShape_{})>; + +template +struct GroupScaleGemm { + using ArchTag = typename ScheduleConfig::ArchTag; + using OperatorClass = typename ScheduleConfig::OperatorClass; + using TileShape = typename ScheduleConfig::TileShape; + using ClusterShape = typename ScheduleConfig::ClusterShape; + using KernelSchedule = typename ScheduleConfig::KernelSchedule; + using EpilogueSchedule = typename ScheduleConfig::EpilogueSchedule; + using EpilogueTileType = typename ScheduleConfig::EpilogueTileType; + using FusionOperation = typename ScheduleConfig::FusionOperation; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, @@ -145,7 +179,7 @@ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBui FusionOperation >::CollectiveOp; -using CollectiveMainloopWithBlockWiseScaling = typename cutlass::gemm::collective::CollectiveBuilder< + using 
CollectiveMainloopWithGroupWiseScaling = typename cutlass::gemm::collective::CollectiveBuilder< ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, LayoutB, AlignmentB, @@ -157,24 +191,38 @@ using CollectiveMainloopWithBlockWiseScaling = typename cutlass::gemm::collectiv KernelSchedule >::CollectiveOp; -using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - Shape, // Indicates ProblemShape - CollectiveMainloopWithBlockWiseScaling, - CollectiveEpilogue ->; + using GemmKernelDefault = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloopWithGroupWiseScaling, + CollectiveEpilogue + >; -using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + using GemmKernelStreamK = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloopWithGroupWiseScaling, + CollectiveEpilogue, + cutlass::gemm::StreamKScheduler + >; + + using GemmDefault = cutlass::gemm::device::GemmUniversalAdapter; + using GemmStreamK = cutlass::gemm::device::GemmUniversalAdapter; +}; + +using GroupScale1D1DGemm = GroupScaleGemm; +using GroupScale1D2DGemm = GroupScaleGemm; +using GroupScale2D1DGemm = GroupScaleGemm; +using GroupScale2D2DGemm = GroupScaleGemm; // Extract information from Gemm kernel. -using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; +using EpilogueOutputOp = typename GroupScale1D1DGemm::GemmDefault::EpilogueOutputOp; using ElementScalar = typename EpilogueOutputOp::ElementScalar; using ElementAmax = typename EpilogueOutputOp::ElementAmax; using ActivationFunctor = typename EpilogueOutputOp::ActivationFn; -using StrideA = typename Gemm::GemmKernel::StrideA; -using StrideB = typename Gemm::GemmKernel::StrideB; -using StrideC = typename Gemm::GemmKernel::StrideC; -using StrideD = typename Gemm::GemmKernel::StrideD; +using StrideA = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideA; +using StrideB = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideB; +using StrideC = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideC; +using StrideD = typename GroupScale1D1DGemm::GemmDefault::GemmKernel::StrideD; using StrideAux = StrideD; constexpr bool IsDFp8 = @@ -185,9 +233,6 @@ constexpr bool IsAuxFp8 = cute::is_same_v or cute::is_same_v; -static_assert(size<0>(TileShape{}) == ScaleGranularityM * ScaleMsPerTile, - "FP8 scaling granularity must evenly divide tile shape along M."); - static_assert(cute::is_same_v, "ElementAccumulator and ElementBlockScale should be same datatype"); @@ -347,13 +392,18 @@ struct Result } /// Initialize operands to be used in the GEMM and reference GEMM +template void initialize(const Options &options) { + using TileShape = typename GroupScaleConfig::TileShape; + const int ScaleMsPerTile = GroupScaleConfig::ScaleMsPerTile; + const int ScaleNsPerTile = GroupScaleConfig::ScaleNsPerTile; + // Find Group Scaling tensor shapes based on `ScaleGranularityM`, problem shape, and TileShape auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k); auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape{}))); auto groupscale_m = cute::get<0>(blockscale_shape) * ScaleMsPerTile; // We need to pad along M in scale tensor of A to prevent illegal memory access. - auto blockscale_n = cute::get<1>(blockscale_shape); + auto groupscale_n = cute::get<1>(blockscale_shape) * ScaleNsPerTile; // We need to pad along N in scale tensor of A to prevent illegal memory access. 
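// Note: blockscale_shape holds the per-mode CTA tile counts of the problem (zipped_divide splits the
// problem layout into (TileShape, rest) and get<1> keeps the rest), so scaling the M and N tile counts
// by ScaleMsPerTile / ScaleNsPerTile yields the extents of the group-scale tensors for A (padded along M)
// and B (padded along N).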
auto blockscale_k = cute::get<2>(blockscale_shape); stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); @@ -362,18 +412,16 @@ void initialize(const Options &options) { stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l)); stride_aux = stride_D; - - auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); auto groupscale_a_coord = cutlass::make_Coord(groupscale_m * options.l, blockscale_k); - auto blockscale_b_coord = cutlass::make_Coord(blockscale_k, blockscale_n * options.l); + auto groupscale_b_coord = cutlass::make_Coord(groupscale_n * options.l, blockscale_k); tensor_A.resize(a_coord); - blockscale_tensor_A.resize(groupscale_a_coord); tensor_B.resize(b_coord); - blockscale_tensor_B.resize(blockscale_b_coord); + blockscale_tensor_A.resize(groupscale_a_coord); + blockscale_tensor_B.resize(groupscale_b_coord); tensor_C.resize(c_coord); tensor_D.resize(c_coord); tensor_ref_D.resize(c_coord); @@ -393,7 +441,7 @@ void initialize(const Options &options) { #if 0 // Dump blockscaled tensors std::cout << "blockscale_tensor_A: " << groupscale_a_coord << std::endl; std::cout << blockscale_tensor_A.host_view() << "\n"; - std::cout << "blockscale_tensor_B: " << blockscale_b_coord << std::endl; + std::cout << "blockscale_tensor_B: " << groupscale_b_coord << std::endl; std::cout << blockscale_tensor_B.host_view() << "\n"; #endif @@ -441,21 +489,26 @@ void initialize(const Options &options) { if (IsDFp8 && options.save_amax) { abs_max_D.resize(cutlass::make_Coord(1)); + initialize_tensor(abs_max_D.host_view(), cutlass::Distribution::AllZeros, 0); abs_max_D.sync_device(); reference_abs_max_D.resize(cutlass::make_Coord(1)); + initialize_tensor(reference_abs_max_D.host_view(), cutlass::Distribution::AllZeros, 0); } if (IsAuxFp8 && options.save_aux && options.save_amax) { abs_max_aux.resize(cutlass::make_Coord(1)); + initialize_tensor(abs_max_aux.host_view(), cutlass::Distribution::AllZeros, 0); abs_max_aux.sync_device(); reference_abs_max_aux.resize(cutlass::make_Coord(1)); + initialize_tensor(reference_abs_max_aux.host_view(), cutlass::Distribution::AllZeros, 0); } } /// Populates a Gemm::Arguments structure from the given commandline options -typename Gemm::Arguments args_from_options(const Options &options) +template +GemmArguments args_from_options(const Options &options) { - typename Gemm::Arguments arguments{ + GemmArguments arguments{ cutlass::gemm::GemmUniversalMode::kGemm, {options.m, options.n, options.k, options.l}, {tensor_A.device_data(), @@ -513,14 +566,15 @@ typename Gemm::Arguments args_from_options(const Options &op return arguments; } -bool verify(const Options &options) { +/// Don't know why the compiler does not like verify() being templated... 
+bool verify(const Options &options, const int ScaleMsPerTile, const int ScaleNsPerTile) { // // Compute reference output // // Group scaling tensors shapes based `ScaleGranularityM`, CTA Block (TileShape) and GEMM Problem shape auto gemm_problem_shape = cute::make_shape(options.m, options.n, options.k); - auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape{}))); + auto blockscale_shape = shape(get<1>(cute::zipped_divide(cute::make_layout(gemm_problem_shape), TileShape_{}))); auto blockscale_m = cute::get<0>(blockscale_shape); auto blockscale_n = cute::get<1>(blockscale_shape); auto blockscale_k = cute::get<2>(blockscale_shape); @@ -565,8 +619,8 @@ bool verify(const Options &options) { ); auto blockscale_B = cute::make_tensor(blockscale_tensor_B.host_data(), cute::make_layout( - cute::make_shape(blockscale_n, blockscale_k, options.l), - cute::make_stride(blockscale_k, 1, blockscale_n * blockscale_k) + cute::make_shape(blockscale_n, ScaleNsPerTile, blockscale_k, options.l), + cute::make_stride(blockscale_k * ScaleNsPerTile, 1, ScaleNsPerTile, blockscale_n * blockscale_k * ScaleNsPerTile) ) ); @@ -575,7 +629,7 @@ bool verify(const Options &options) { cutlass::reference::host::GettMainloopParams mainloop_params{ + TileShape_> mainloop_params{ A, B, // Operand Tensors blockscale_A, blockscale_B // Groupwise scaling Tensors }; @@ -641,16 +695,22 @@ bool verify(const Options &options) { } /// Execute a given example GEMM computation -template +template int run(Options &options) { - initialize(options); + using TileShape = typename GroupScaleConfig::TileShape; + const int ScaleGranularityM = GroupScaleConfig::ScaleGranularityM; + const int ScaleGranularityN = GroupScaleConfig::ScaleGranularityN; + const int ScaleMsPerTile = GroupScaleConfig::ScaleMsPerTile; + const int ScaleNsPerTile = GroupScaleConfig::ScaleNsPerTile; + + initialize(options); // Instantiate CUTLASS kernel depending on templates Gemm gemm; // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm - auto arguments = args_from_options(options); + auto arguments = args_from_options(options); // Using the arguments, query for extra workspace required for matrix multiplication computation size_t workspace_size = Gemm::get_workspace_size(arguments); @@ -669,7 +729,7 @@ int run(Options &options) // Check if output from CUTLASS kernel and reference kernel are equal or not Result result; - result.passed = verify(options); + result.passed = verify(options, ScaleMsPerTile, ScaleNsPerTile); std::cout << " Disposition: " << (result.passed ? 
"Passed" : "Failed") << std::endl; @@ -683,6 +743,7 @@ int run(Options &options) GpuTimer timer; timer.start(); for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); CUTLASS_CHECK(gemm.run()); } timer.stop(); @@ -702,9 +763,13 @@ int run(Options &options) } std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::cout << " Tile shape (M, N, K): " << size<0>(TileShape{}) << ", " << size<1>(TileShape{}) << ", " << size<2>(TileShape{}) << std::endl; + std::cout << " ScaleGranularityM: " << ScaleGranularityM << " (ScaleMsPerTile: " << ScaleMsPerTile << ")" << std::endl; + std::cout << " ScaleGranularityN: " << ScaleGranularityN << " (ScaleNsPerTile: " << ScaleNsPerTile << ")" << std::endl; std::cout << " Rasterization: " << raster << " with a maximum CTA swizzle of " << options.swizzle << std::endl; std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; std::cout << " GFLOPS: " << result.gflops << std::endl; + fflush(stdout); } return 0; @@ -753,7 +818,27 @@ int main(int argc, char const **args) { // #if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) - run(options); + std::cout << "Basic split-K GEMM kernel" << std::endl; + run(options); + std::cout << std::endl; + run(options); + std::cout << std::endl; + run(options); + std::cout << std::endl; + run(options); + std::cout << std::endl; + + std::cout << std::endl; + + std::cout << "StreamK GEMM kernel" << std::endl; + run(options); + std::cout << std::endl; + run(options); + std::cout << std::endl; + run(options); + std::cout << std::endl; + run(options); + std::cout << std::endl; #endif return 0; diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h index cb3ff022..e9809f6b 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_groupwise_scaling.h @@ -220,10 +220,12 @@ void gett_mainloop( int64_t block_m = m / kBlockM; int64_t block_n = n / kBlockN; cute::Tensor blockscale_A = mainloop_params.ScaleA(block_m, _, _, l); - cute::Tensor blockscale_B = mainloop_params.ScaleB(block_n, _, l); + cute::Tensor blockscale_B = mainloop_params.ScaleB(block_n, _, _, l); const int ScaleGranularityM = cute::size<0>(typename MainloopParams::TileShape{}) / cute::size<1>(mainloop_params.ScaleA.shape()); - assert(cute::size<0>(typename MainloopParams::TileShape{}) == ScaleGranularityM * cute::size<1>(mainloop_params.ScaleA.shape())); + const int ScaleGranularityN = cute::size<1>(typename MainloopParams::TileShape{}) / cute::size<1>(mainloop_params.ScaleB.shape()); + assert(cute::size<0>(typename MainloopParams::TileShape{}) == ScaleGranularityM * cute::size<1>(mainloop_params.ScaleA.shape())); + assert(cute::size<1>(typename MainloopParams::TileShape{}) == ScaleGranularityN * cute::size<1>(mainloop_params.ScaleB.shape())); // Compute on this k-block for (int64_t k = 0; k < cute::size<1>(mainloop_params.A.layout()); ++k) { @@ -231,7 +233,7 @@ void gett_mainloop( // Load Blockwise scaling factor from blockscale Tensors for B int64_t block_k = k / kBlockK; cute::Tensor scale_a = blockscale_A(_, block_k); - ElementBlockScaleB scale_b = blockscale_B[block_k]; + 
cute::Tensor scale_b = blockscale_B(_, block_k); // Load A ElementAccumulator a_frag[kBlockM]; @@ -268,8 +270,10 @@ // (c) Update permanent (accu) if ((k+1) % kBlockK == 0) { for (int m_b = 0; m_b < kBlockM; ++m_b) { + auto scale_a_m_b = scale_a[m_b / ScaleGranularityM]; for (int n_b = 0; n_b < kBlockN; ++n_b) { - ElementAccumulator blockwise_scaled_accum = acc_temp[m_b][n_b] * scale_a[m_b / ScaleGranularityM] * scale_b; + auto scale_b_n_b = scale_b[n_b / ScaleGranularityN]; + ElementAccumulator blockwise_scaled_accum = acc_temp[m_b][n_b] * scale_a_m_b * scale_b_n_b; acc[m_b][n_b] = blockwise_scaled_accum + acc[m_b][n_b]; acc_temp[m_b][n_b] = ElementAccumulator(0); } diff --git a/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_bf16_grouped_gemm.cu b/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_bf16_grouped_gemm.cu index b22d8305..c1978c32 100644 --- a/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_bf16_grouped_gemm.cu +++ b/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_bf16_grouped_gemm.cu @@ -32,7 +32,19 @@ /*! \file \brief - NOTE: Write docu + Hopper Mixed-input Grouped GEMM example using CUTLASS 3 APIs for NVIDIA Hopper architecture. + See 55_hopper_int4_bf16_gemm.cu for more details about W4A16 GEMMs with layout shuffling. + + Limitations: + 1) Only row-wise scaling is supported; zero-points and block-wise scaling are currently not supported. + + To run this example: + + $ ./examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_bf16_grouped_gemm --m=2048 --n=2048 --k=2048 --mode=1 --groups=10 + + The above command sizes all 10 groups at the given m, n, and k values. + Skipping any of the problem dimensions randomizes that dimension across the different groups. + The same applies to the alpha and beta values, which are randomized across the different groups. */ #include diff --git a/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_fp8_grouped_gemm.cu b/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_fp8_grouped_gemm.cu index cc0494ec..07ff66b3 100644 --- a/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_fp8_grouped_gemm.cu +++ b/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_fp8_grouped_gemm.cu @@ -32,7 +32,19 @@ /*! \file \brief - NOTE: Write docu + Hopper Mixed-input Grouped GEMM example using CUTLASS 3 APIs for NVIDIA Hopper architecture. + See 55_hopper_int4_fp8_gemm.cu for more details about W4A8 GEMMs with lookup table. + + Limitations: + 1) Only row-wise scaling is supported; zero-points and block-wise scaling are currently not supported. + + To run this example: + + $ ./examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_int4_fp8_grouped_gemm --m=2048 --n=2048 --k=2048 --mode=1 --groups=10 + + The above command sizes all 10 groups at the given m, n, and k values. + Skipping any of the problem dimensions randomizes that dimension across the different groups. + The same applies to the alpha and beta values, which are randomized across the different groups. */ #include diff --git a/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_mixed_dtype_grouped_gemm.cu b/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_mixed_dtype_grouped_gemm.cu index 883d8cbf..ffeb233e 100644 --- a/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_mixed_dtype_grouped_gemm.cu +++ b/examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_mixed_dtype_grouped_gemm.cu @@ -31,7 +31,19 @@ /*!
\file \brief - NOTE: Write docu + Hopper Mixed-input Grouped GEMM example using CUTLASS 3 APIs for NVIDIA Hopper architecture. + See 55_hopper_mixed_dtype_gemm.cu for more details about Mixed-input GEMMs. + + Limitations: + 1) Only row-wise scaling is supported; zero-points and block-wise scaling are currently not supported. + + To run this example: + + $ ./examples/69_hopper_mixed_dtype_grouped_gemm/69_hopper_mixed_dtype_grouped_gemm --m=2048 --n=2048 --k=2048 --mode=1 --groups=10 + + The above command sizes all 10 groups at the given m, n, and k values. + Skipping any of the problem dimensions randomizes that dimension across the different groups. + The same applies to the alpha and beta values, which are randomized across the different groups. */ #include diff --git a/examples/69_hopper_mixed_dtype_grouped_gemm/README.md b/examples/69_hopper_mixed_dtype_grouped_gemm/README.md new file mode 100644 index 00000000..272d36e5 --- /dev/null +++ b/examples/69_hopper_mixed_dtype_grouped_gemm/README.md @@ -0,0 +1,14 @@ +This example extends Example 55 to support Grouped GEMMs in CUTLASS. + +## High level overview + +This example shows how to perform Grouped GEMMs on Hopper when A and B have different types. In the Grouped GEMM, multiple GEMMs with potentially different problem shapes can be executed in a batch. The interface is similar to the standard mixed-input GEMM presented in Example 55, with a few noteworthy differences: +- Inside the collective builder, replace the layout types with layout pointer types. +- In the arguments, pass the number of groups, the array of problem sizes, and the arrays of strides for matrices A and B. +- If scales and zero-points are included, also pass the arrays of their strides in the arguments. + +Note that in Example 55, the argument `--g` is used to determine the block scale size. It is important not to confuse this with the `--groups` argument in this example, which specifies the number of GEMMs. + +## Upcoming features + +Currently, the Mixed-input Grouped GEMM only supports row-wise scaling. Please contact us if zero-points or block-wise scaling are needed. diff --git a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu index 39123cac..3cee6caf 100644 --- a/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu +++ b/examples/70_blackwell_gemm/70_blackwell_fp16_gemm.cu @@ -115,15 +115,11 @@ using OperatorClass = cutlass::arch::OpClassTensorOp; // O using MmaTileShape_MNK = Shape<_256,_128,_64>; // Shape of the threadblocks in a cluster using ClusterShape_MNK = Shape<_2,_2,_1>; -// Shape of the threadblocks participating in a tcgen05 MMA.
<1, 1, 1> for cta_group = 1, <2, 1, 1> for cta_group = 2 -using AtomThrShape_MNK = Shape<_2, _1, _1>; -// Shape of the tile computed by each SM -using PerSmTileShape_MNK = decltype(shape_div(MmaTileShape_MNK{}, AtomThrShape_MNK{})); // Build the epilogue using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - PerSmTileShape_MNK, ClusterShape_MNK, + MmaTileShape_MNK, ClusterShape_MNK, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutC, AlignmentC, diff --git a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu index 0b1758b9..69a36310 100644 --- a/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu +++ b/examples/70_blackwell_gemm/70_blackwell_fp8_gemm.cu @@ -131,17 +131,13 @@ using ElementAmax = float; using MmaTileShape_MNK = Shape<_256,_128,_64>; // Shape of the threadblocks in a cluster using ClusterShape_MNK = Shape<_2,_2,_1>; -// Shape of the threadblocks participating in a tcgen05 MMA. <1, 1, 1> for cta_group = 1, <2, 1, 1> for cta_group = 2 -using AtomThrShape_MNK = Shape<_2, _1, _1>; -// Shape of the tile computed by each SM -using PerSmTileShape_MNK = decltype(shape_div(MmaTileShape_MNK{}, AtomThrShape_MNK{})); using FusionOp = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltActAmaxAux< LayoutC, cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementAux, ElementAmax, ElementBias>; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, - PerSmTileShape_MNK, ClusterShape_MNK, + MmaTileShape_MNK, ClusterShape_MNK, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC, diff --git a/examples/70_blackwell_gemm/CMakeLists.txt b/examples/70_blackwell_gemm/CMakeLists.txt index d88a8c56..cb401e3a 100644 --- a/examples/70_blackwell_gemm/CMakeLists.txt +++ b/examples/70_blackwell_gemm/CMakeLists.txt @@ -28,7 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 70_blackwell_fp16_gemm 70_blackwell_fp16_gemm.cu diff --git a/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu b/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu index 6712d7a9..427af254 100644 --- a/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu +++ b/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu @@ -184,12 +184,8 @@ struct ExampleRunner { std::is_same_v || // Auto schedule will try to select 2sm cluster MMA based on cluster M std::is_same_v && size<0>(ClusterShapeMNK{}) % 2 == 0; - // The MNK layout of CTAs within a cluster MMA - using AtomThrMNK = std::conditional_t, Shape<_1,_1,_1>>; // The MMA tile used by the mainloop collective. 
Blackwell 1sm MMA supports up to MMA tile M = 128, 2sm MMA supports up to MMA tile M = 256 using MmaTileMNK = std::conditional_t, Shape<_128,_128,_64>>; - // The Output tile used by the epilogue collective - using OutputTileMNK = decltype(shape_div(MmaTileMNK{}, AtomThrMNK{})); // 16B alignment lets us use TMA static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; @@ -220,7 +216,7 @@ struct ExampleRunner { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, - OutputTileMNK, ClusterShapeMNK, + MmaTileMNK, ClusterShapeMNK, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC, @@ -503,20 +499,20 @@ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MIN print_result("KernelScheduleAuto mainloop schedule with EpilogueScheduleAuto epilogue schedule and 3 mainloop stages", passed); // 1SM cluster MMA mainloop schedules can be used with direct store ("no-smem") epilogue schedules - ExampleRunner runner_2; + ExampleRunner runner_2; passed = runner_2.run(options, hw_info); - print_result("KernelTmaWarpSpecialized1SmSm100 mainloop schedule with NoSmemWarpSpecialized epilogue schedule", passed); + print_result("KernelTmaWarpSpecialized1SmSm100 mainloop schedule with NoSmemWarpSpecialized1Sm epilogue schedule", passed); // 1SM cluster MMA mainloop schedules can also be used with 1SM TMA epilogue schedules // 1SM cluster MMA mainloop schedules will not work with 2SM TMA epilogue schedules ExampleRunner runner_3; passed = runner_3.run(options, hw_info); - print_result("KernelTmaWarpSpecialized1SmSm100 mainloop schedule with NoSmemWarpSpecialized epilogue schedule", passed); + print_result("KernelTmaWarpSpecialized1SmSm100 mainloop schedule with TmaWarpSpecialized1Sm epilogue schedule", passed); // 2SM cluster MMA mainloop schedules can be used with direct store ("no-smem") epilogue schedules - ExampleRunner runner_4; + ExampleRunner runner_4; passed = runner_4.run(options, hw_info); - print_result("KernelTmaWarpSpecialized2SmSm100 mainloop schedule with NoSmemWarpSpecialized epilogue schedule", passed); + print_result("KernelTmaWarpSpecialized2SmSm100 mainloop schedule with NoSmemWarpSpecialized2Sm epilogue schedule", passed); // 2SM cluster MMA mainloop schedules can also be used with 2SM TMA epilogue schedules // 2SM cluster MMA mainloop schedules will not work with SM TMA epilogue schedules @@ -556,11 +552,11 @@ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MIN // Blackwell direct store epilogue schedule supports custom EVTs and named fusion operations as well (not supported for pre-Blackwell kernels) ExampleRunner< cutlass::gemm::KernelTmaWarpSpecialized1SmSm100, - cutlass::epilogue::NoSmemWarpSpecialized, + cutlass::epilogue::NoSmemWarpSpecialized1Sm, cutlass::gemm::collective::StageCountAuto, UseCustomEVT> runner_9; passed = runner_9.run(options, hw_info); - print_result("KernelTmaWarpSpecialized1SmSm100 mainloop schedule with NoSmemWarpSpecialized epilogue and custom EVT", passed); + print_result("KernelTmaWarpSpecialized1SmSm100 mainloop schedule with NoSmemWarpSpecialized1Sm epilogue and custom EVT", passed); #endif diff --git a/examples/71_blackwell_gemm_with_collective_builder/CMakeLists.txt b/examples/71_blackwell_gemm_with_collective_builder/CMakeLists.txt index 5bac6494..a326f461 100644 --- a/examples/71_blackwell_gemm_with_collective_builder/CMakeLists.txt +++ 
b/examples/71_blackwell_gemm_with_collective_builder/CMakeLists.txt @@ -27,7 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # Both filenames are shorter to avoid MAX_PATH issues on Windows. -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 71_blackwell_gemm_with_collective_builder 71_blackwell_gemm_with_collective_builder.cu diff --git a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu index ec597966..f7e12fbf 100644 --- a/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu +++ b/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu @@ -117,11 +117,10 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O // Kernel Perf config using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster -using PerSmTileShape_MNK = Shape<_128,_256,_256>; // Threadblock-level tile size using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - PerSmTileShape_MNK, ClusterShape, + MmaTileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutCTag, AlignmentC, diff --git a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu index cefa3e92..2719cab9 100644 --- a/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu +++ b/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu @@ -121,7 +121,6 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O // Kernel Perf config using MmaTileShape = Shape<_128,_128,_256>; // MMA's tile size using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster -using PerSmTileShape_MNK = Shape<_128,_128,_256>; // Threadblock-level tile size constexpr int InputSFVectorSize = 16; constexpr int OutputSFVectorSize = InputSFVectorSize; @@ -137,7 +136,7 @@ using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor< using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - PerSmTileShape_MNK, ClusterShape, + MmaTileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutCTag, AlignmentC, diff --git a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu index b73f2c94..2784d050 100644 --- a/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu +++ b/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu @@ -118,11 +118,10 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O // Kernel Perf config using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster -using PerSmTileShape_MNK = Shape<_128,_256,_256>; // Threadblock-level tile size using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - PerSmTileShape_MNK, ClusterShape, + MmaTileShape, ClusterShape, 
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutCTag, AlignmentC, diff --git a/examples/72_blackwell_narrow_precision_gemm/CMakeLists.txt b/examples/72_blackwell_narrow_precision_gemm/CMakeLists.txt index fa80c184..eaeb6600 100644 --- a/examples/72_blackwell_narrow_precision_gemm/CMakeLists.txt +++ b/examples/72_blackwell_narrow_precision_gemm/CMakeLists.txt @@ -28,7 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 72a_blackwell_nvfp4_bf16_gemm 72a_blackwell_nvfp4_bf16_gemm.cu diff --git a/examples/73_blackwell_gemm_preferred_cluster/CMakeLists.txt b/examples/73_blackwell_gemm_preferred_cluster/CMakeLists.txt index 0d0f7757..a4a18324 100644 --- a/examples/73_blackwell_gemm_preferred_cluster/CMakeLists.txt +++ b/examples/73_blackwell_gemm_preferred_cluster/CMakeLists.txt @@ -28,7 +28,7 @@ -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 73_blackwell_gemm_preferred_cluster blackwell_gemm_preferred_cluster.cu diff --git a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu index fb62e844..19c4efd1 100644 --- a/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu +++ b/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu @@ -129,27 +129,22 @@ using OperatorClass = cutlass::arch::OpClassTensorOp; // O // MMA and Cluster Tile Shapes // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster Shape % 2 == 0 using MmaTileShape_MNK = Shape<_256,_128,_64>; -// Shape of the threadblocks participating in a tcgen05 MMA. 
<1, 1, 1> for cta_group = 1, <2, 1, 1> for cta_group = 2 -using AtomThrShape_MNK = Shape<_2, _1, _1>; -// Shape of the tile computed by each SM -using PerSmTileShape_MNK = decltype(shape_div(MmaTileShape_MNK{}, AtomThrShape_MNK{})); // Shape of the cluster set to to indicate dynamic cluster shape using ClusterShape_MNK = Shape; // When dynamic cluster is used, KernelScheduleAuto always selects mainloop dispatch policy that // lowers to tcgen05 MMA cta_group = 1 as we don't know if the dynamic cluster M dimension will be a multiple of 2 -// To use KernelScheduleAuto, users need to set AtomThrShape_MNK to Shape<1, 1, 1> -using KernelSchedule = cute::conditional_t; +// To use tcgen05 MMA cta_group = 2, users must explicitly use 2sm builder schedules +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmSm100; +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - PerSmTileShape_MNK, ClusterShape_MNK, + MmaTileShape_MNK, ClusterShape_MNK, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutC, AlignmentC, ElementC, LayoutC, AlignmentC, - cutlass::epilogue::collective::EpilogueScheduleAuto + EpilogueSchedule >::CollectiveOp; using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< diff --git a/examples/74_blackwell_gemm_streamk/CMakeLists.txt b/examples/74_blackwell_gemm_streamk/CMakeLists.txt index 618561f5..5a378241 100644 --- a/examples/74_blackwell_gemm_streamk/CMakeLists.txt +++ b/examples/74_blackwell_gemm_streamk/CMakeLists.txt @@ -29,7 +29,7 @@ -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 74_blackwell_gemm_streamk blackwell_gemm_streamk.cu diff --git a/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu b/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu index bb99fa4a..8f6def99 100644 --- a/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu +++ b/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu @@ -133,22 +133,17 @@ using OperatorClass = cutlass::arch::OpClassTensorOp; // O // MMA and Cluster Tile Shapes // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster Shape % 2 == 0 using MmaTileShape_MNK = Shape<_256,_128,_64>; -// Shape of the threadblocks participating in a tcgen05 MMA. 
<1, 1, 1> for cta_group = 1, <2, 1, 1> for cta_group = 2 -using AtomThrShape_MNK = Shape<_2, _1, _1>; -// Shape of the tile computed by each SM -using PerSmTileShape_MNK = decltype(shape_div(MmaTileShape_MNK{}, AtomThrShape_MNK{})); // Shape of the cluster set to to indicate dynamic cluster shape using ClusterShape_MNK = Shape; -// When dynamic cluster is used, KernelScheduleAuto always selects mainloop dispatch policy that +// When dynamic cluster is used, KernelScheduleAuto always selects mainloop dispatch policy that // lowers to tcgen05 MMA cta_group = 1 as we don't know if the dynamic cluster M dimension will be a multiple of 2 -// To use KernelScheduleAuto, users need to set AtomThrShape_MNK to Shape<1, 1, 1> -using KernelSchedule = cute::conditional_t; +// To use tcgen05 MMA cta_group = 2, users must explicitly use 2sm builder schedules +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmSm100; +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - PerSmTileShape_MNK, ClusterShape_MNK, + MmaTileShape_MNK, ClusterShape_MNK, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutC, AlignmentC, diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu index 520d8cee..1d8db6e2 100644 --- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu @@ -121,32 +121,25 @@ using StageCountType = cutlass::gemm::collective::StageCountAuto; // S // Runtime Cluster Shape using ClusterShape = Shape; -// For Static Cluster Shape: -// using ClusterShape = Shape<_2,_1,_1>; // for example -// using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{})); // for 2SM config -// using OutputTileShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{})); // for epilogue builder -// using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{})); // for mainloop builder // Different configs for 1SM and 2SM MMA kernel struct MMA1SMConfig { using MmaTileShape = Shape<_128,_256,Int<128 / sizeof(ElementA)>>; using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch - using OutputTileShape = decltype(shape_div(MmaTileShape{}, Shape<_1,_1,_1>{})); }; struct MMA2SMConfig { using MmaTileShape = Shape<_256,_256,Int<128 / sizeof(ElementA)>>; using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch - using OutputTileShape = decltype(shape_div(MmaTileShape{}, Shape<_2,_1,_1>{})); }; template struct GivenGemmSchedule { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - typename ScheduleConfig::OutputTileShape, ClusterShape, + typename ScheduleConfig::MmaTileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutC *, AlignmentC, diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu index fa65e508..ee697135 100644 --- 
a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu @@ -143,31 +143,23 @@ using StageCountType = cutlass::gemm::collective::StageCountAuto; // S // Runtime Cluster Shape using ClusterShape = Shape; -/* // For Static Cluster Shape: -use ClusterShape = Shape<_2,_1,_1> for example -using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{})); // for 2SM config -using OutputTileShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{})); // for epilogue builder -using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{})); // for mainloop builder -*/ // Different configs for 1SM and 2SM MMA kernel struct MMA1SMConfig { using MmaTileShape = Shape<_128,_256,_256>; using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmNvf4Sm100; // Kernel to launch using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch - using OutputTileShape = decltype(shape_div(MmaTileShape{}, Shape<_1,_1,_1>{})); }; struct MMA2SMConfig { using MmaTileShape = Shape<_256,_256,_256>; using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmNvf4Sm100; // Kernel to launch using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch - using OutputTileShape = decltype(shape_div(MmaTileShape{}, Shape<_2,_1,_1>{})); }; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, EpilogueOperatorClass, - typename MMA1SMConfig::OutputTileShape, ClusterShape, + typename MMA1SMConfig::MmaTileShape, ClusterShape, Shape<_128,_64>, ElementAccumulator, ElementAccumulator, ElementC, LayoutC *, AlignmentC, @@ -195,7 +187,7 @@ using Gemm = Gemm1SM; using CollectiveEpilogue2SM = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, EpilogueOperatorClass, - typename MMA2SMConfig::OutputTileShape, ClusterShape, + typename MMA2SMConfig::MmaTileShape, ClusterShape, Shape<_128,_64>, ElementAccumulator, ElementAccumulator, ElementC, LayoutC *, AlignmentC, diff --git a/examples/75_blackwell_grouped_gemm/CMakeLists.txt b/examples/75_blackwell_grouped_gemm/CMakeLists.txt index 2da2d4c4..0ce48662 100644 --- a/examples/75_blackwell_grouped_gemm/CMakeLists.txt +++ b/examples/75_blackwell_grouped_gemm/CMakeLists.txt @@ -49,7 +49,7 @@ set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0) set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 75_blackwell_grouped_gemm 75_blackwell_grouped_gemm.cu diff --git a/examples/76_blackwell_conv/CMakeLists.txt b/examples/76_blackwell_conv/CMakeLists.txt index 8d31d743..e4042aa6 100644 --- a/examples/76_blackwell_conv/CMakeLists.txt +++ b/examples/76_blackwell_conv/CMakeLists.txt @@ -28,7 +28,7 @@ -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 76_blackwell_conv_fprop 76_blackwell_conv_fprop.cu diff --git a/examples/77_blackwell_fmha/CMakeLists.txt b/examples/77_blackwell_fmha/CMakeLists.txt index c840d8ba..90b47387 100644 --- a/examples/77_blackwell_fmha/CMakeLists.txt +++ b/examples/77_blackwell_fmha/CMakeLists.txt @@ -49,7 +49,7 @@ set(TEST_GEN_REMAP --b=2 --h=4 --h_k=2 --k=512 --d=128 --verify --remap) set(TEST_GEN_CACHEONLY --b=2 --h=4 
--h_k=2 --k=512 --d=128 --verify --cache-only) if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))) - if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") + if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 77_blackwell_fmha_fp8 77_blackwell_fmha.cu diff --git a/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu b/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu index 48e1da6c..f50e85b4 100644 --- a/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu +++ b/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu @@ -106,25 +106,23 @@ using ArchTag = cutlass::arch::Sm100; // T using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag // Kernel Perf config -using ClusterTileShape = Shape<_256,_128,_16>; // Cluster-level tile shape -using ClusterShape = Shape<_2,_1,_1>; // Shape of the threadblocks in a cluster -using CtaTileShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{})); // Threadblock-level tile shape -using MmaTileShape = Shape<_256,_128,_16>; // Mma instruction shape +using ClusterShape = Shape<_2,_1,_1>; // Shape of the threadblocks in a cluster +using MmaTileShape = Shape<_256,_128,_16>; // Mma instruction shape // Build the epilogue using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, - CtaTileShape, ClusterShape, + MmaTileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementAccumulator, ElementC, LayoutC, AlignmentC, ElementC, LayoutC, AlignmentC, - cutlass::epilogue::NoSmemWarpSpecialized + cutlass::epilogue::NoSmemWarpSpecialized2Sm >::CollectiveOp; // Build the mainloop // Note: Emulated BF16x9 kernels need to manually specify a mainloop schedule and cannot use KernelScheduleAuto -using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecializedFastFP32SmemSm100; +using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmFastFP32SmemSm100; using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, diff --git a/examples/78_blackwell_emulated_bf16x9_gemm/CMakeLists.txt b/examples/78_blackwell_emulated_bf16x9_gemm/CMakeLists.txt index 1b36a4fd..6fcbd062 100644 --- a/examples/78_blackwell_emulated_bf16x9_gemm/CMakeLists.txt +++ b/examples/78_blackwell_emulated_bf16x9_gemm/CMakeLists.txt @@ -28,7 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100") +if (CUTLASS_NVCC_ARCHS MATCHES 100a) cutlass_example_add_executable( 78_blackwell_emulated_bf16x9_gemm 78_blackwell_emulated_bf16x9_gemm.cu diff --git a/examples/README.md b/examples/README.md index ec39bf22..68bf7077 100644 --- a/examples/README.md +++ b/examples/README.md @@ -254,7 +254,7 @@ Blackwell SM100 GEMM example demonstrating compatible mainloop+epilogue builder schedules and epilogue visitor tree (EVT) construction -* [72a_blackwell_narrow_precision_gemm](72a_blackwell_narrow_precision_gemm) +* [72_blackwell_narrow_precision_gemm](72_blackwell_narrow_precision_gemm/) Block-scaled dense GEMM example targeting the NVIDIA Blackwell SM100 Tensor Core MMA using CUTLASS 3.x APIs. 
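The example updates above (73_blackwell_gemm_preferred_cluster, 74_blackwell_gemm_streamk, 75_blackwell_grouped_gemm, 78_blackwell_emulated_bf16x9_gemm) all apply the same recipe: the hand-derived AtomThrShape_MNK / PerSmTileShape_MNK / OutputTileShape aliases are removed, the MMA tile shape is handed to the collective builders directly, and cta_group = 2 is requested by naming an explicit 2SM kernel and epilogue schedule instead of relying on KernelScheduleAuto. The sketch below gathers that recipe in one place. The element types, layouts, alignments, the Shape<int,int,_1> runtime cluster shape, and the mainloop-builder argument order are illustrative assumptions modeled on the existing CUTLASS 3.x examples, not text taken from this diff.

// Sketch of the explicit-2SM builder pattern used by the updated Blackwell examples.
// Element/layout/alignment choices and the runtime cluster shape are assumptions.
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cute/tensor.hpp"

using namespace cute;

using ElementA           = cutlass::half_t;               // illustrative
using ElementB           = cutlass::half_t;               // illustrative
using ElementC           = cutlass::half_t;               // illustrative
using ElementAccumulator = float;
using LayoutA            = cutlass::layout::RowMajor;     // illustrative
using LayoutB            = cutlass::layout::ColumnMajor;  // illustrative
using LayoutC            = cutlass::layout::ColumnMajor;  // illustrative
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

using ArchTag          = cutlass::arch::Sm100;
using OperatorClass    = cutlass::arch::OpClassTensorOp;
using MmaTileShape_MNK = Shape<_256,_128,_64>;             // whole MMA tile, may span 2 SMs
using ClusterShape_MNK = Shape<int,int,_1>;                // dynamic (runtime) cluster shape, assumed

// Explicit 2SM schedules replace KernelScheduleAuto / EpilogueScheduleAuto.
using KernelSchedule   = cutlass::gemm::KernelTmaWarpSpecialized2SmSm100;
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm;

using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    MmaTileShape_MNK, ClusterShape_MNK,                    // MMA tile passed directly, no per-SM tile
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementAccumulator,
    ElementC, LayoutC, AlignmentC,
    ElementC, LayoutC, AlignmentC,
    EpilogueSchedule
  >::CollectiveOp;

using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    ElementA, LayoutA, AlignmentA,
    ElementB, LayoutB, AlignmentB,
    ElementAccumulator,
    MmaTileShape_MNK, ClusterShape_MNK,
    cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    KernelSchedule
  >::CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

As the updated comments in examples 73 and 74 state, KernelScheduleAuto combined with a dynamic cluster always lowers to cta_group = 1, so naming the 2SM schedules explicitly is what actually enables the 2-SM tcgen05 MMA path.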
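The 78_blackwell_emulated_bf16x9_gemm example updated above (and listed in the README hunk that follows) emulates FP32 GEMM with BF16 math: each FP32 operand is split into three BF16-representable terms, so one FP32 multiply is reconstructed from 3 x 3 = 9 BF16 products accumulated in FP32, which is where the BF16x9 / FastFP32 name comes from. The scalar sketch below only illustrates that decomposition; it is not the CUTLASS kernel, and the to_bf16 helper (which truncates to BF16 precision) is a hypothetical stand-in for the hardware conversion.

// Conceptual, host-only illustration of the BF16x9 idea: split each FP32 value
// into three BF16-representable terms and recover the FP32 product from the
// nine cross terms. Not the CUTLASS implementation.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical helper: keep only the bits of an FP32 value that survive a
// conversion to BF16 (sign, exponent, top 7 mantissa bits), i.e. truncate.
static float to_bf16(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

int main() {
  float a = 1.2345678f;
  float b = 0.87654321f;

  // Three-way split: hi holds the leading ~8 mantissa bits, mid and lo the residuals.
  float a_hi = to_bf16(a), a_mid = to_bf16(a - a_hi), a_lo = to_bf16(a - a_hi - a_mid);
  float b_hi = to_bf16(b), b_mid = to_bf16(b - b_hi), b_lo = to_bf16(b - b_hi - b_mid);

  // Nine BF16-by-BF16 products, accumulated in FP32, approximate one FP32 product.
  float emulated = a_hi * b_hi + a_hi * b_mid + a_hi * b_lo
                 + a_mid * b_hi + a_mid * b_mid + a_mid * b_lo
                 + a_lo * b_hi + a_lo * b_mid + a_lo * b_lo;

  std::printf("fp32 product: %.9g  bf16x9 estimate: %.9g  abs error: %g\n",
              a * b, emulated, std::fabs(a * b - emulated));
  return 0;
}

The kernel itself performs these products with BF16 tensor-core MMAs and FP32 accumulation rather than scalar host math, which is why the example pairs the FastFP32 mainloop schedule with ordinary FP32 inputs and outputs.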
@@ -278,6 +278,10 @@ Blackwell SM100 FMHA kernel +* [78_blackwell_emulated_bf16x9_gemm](78_blackwell_emulated_bf16x9_gemm) + + Blackwell SM100 FastFP32 (using BF16 to emulate SGEMM) kernel + # CuTe - Programming Examples Examples that do not rely on CUTLASS and directly showcase the features of CuTe are located in [cutlass/examples/cute](./cute/). diff --git a/include/cute/arch/config.hpp b/include/cute/arch/config.hpp index e1950b92..b97fc4c8 100644 --- a/include/cute/arch/config.hpp +++ b/include/cute/arch/config.hpp @@ -86,5 +86,3 @@ #define CUTE_ARCH_FLOAT2_MATH_ENABLED #endif - - diff --git a/include/cute/arch/copy_sm90_desc.hpp b/include/cute/arch/copy_sm90_desc.hpp index a157008c..f5f50647 100644 --- a/include/cute/arch/copy_sm90_desc.hpp +++ b/include/cute/arch/copy_sm90_desc.hpp @@ -208,6 +208,7 @@ to_CUtensorMapDataType() { if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;} else if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT16; } else if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT32; } else diff --git a/include/cute/arch/mma_sm100_umma.hpp b/include/cute/arch/mma_sm100_umma.hpp index d954544f..1f74223b 100644 --- a/include/cute/arch/mma_sm100_umma.hpp +++ b/include/cute/arch/mma_sm100_umma.hpp @@ -956,7 +956,7 @@ template struct SM100_MMA_MXF4_SS { - static_assert(M == 128, "SM100_MMA_MXF4_SS M-mode size should be 128 for 1 CTA cluster OMMA."); + static_assert(M == 128, "SM100_MMA_MXF4_SS M-mode size should be 128 for 1 CTA cluster MMA."); static_assert((N % 8 == 0) && (8 <= N) && (N <= 256), "SM100_MMA_MXF4_SS N-mode size should be a multiple of 8 between 8 and 256."); static_assert((VS == 16) || (VS == 32), "SM100_MMA_MXF4_SS Vector size can only be 16 or 32."); diff --git a/include/cute/atom/copy_traits_sm100.hpp b/include/cute/atom/copy_traits_sm100.hpp index cd344fd5..6a767ae3 100644 --- a/include/cute/atom/copy_traits_sm100.hpp +++ b/include/cute/atom/copy_traits_sm100.hpp @@ -45,7 +45,6 @@ namespace cute { - template <> struct Copy_Traits { diff --git a/include/cute/container/array.hpp b/include/cute/container/array.hpp index ea3eaf72..a431fc4a 100644 --- a/include/cute/container/array.hpp +++ b/include/cute/container/array.hpp @@ -372,7 +372,7 @@ void swap(array& a, array& b) /// @return A cute::array of the elements of @c t in reverse order. 
template CUTE_HOST_DEVICE constexpr -cute::array reverse(cute::array const& t) +cute::array reverse(cute::array const& t) { if constexpr (N == 0u) { return t; @@ -441,17 +441,6 @@ struct tuple_element> using type = T; }; -template -struct tuple_size const> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element const> -{ - using type = T; -}; - } // end namespace CUTE_STL_NAMESPACE #ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD @@ -477,16 +466,5 @@ struct tuple_element> using type = T; }; -template -struct tuple_size const> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element const> -{ - using type = T; -}; - } // end namespace std #endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/container/array_subbyte.hpp b/include/cute/container/array_subbyte.hpp index d6b1fafb..38da7ace 100644 --- a/include/cute/container/array_subbyte.hpp +++ b/include/cute/container/array_subbyte.hpp @@ -611,17 +611,6 @@ struct tuple_element> using type = T; }; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> -{ - using type = T; -}; - } // end namespace CUTE_STL_NAMESPACE #ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD @@ -647,16 +636,5 @@ struct tuple_element> using type = T; }; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> -{ - using type = T; -}; - } // end namespace std #endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/container/packed_tuple.hpp b/include/cute/container/packed_tuple.hpp deleted file mode 100644 index a7a1c3b2..00000000 --- a/include/cute/container/packed_tuple.hpp +++ /dev/null @@ -1,254 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -#pragma once - -#include -#include -#include -#include - -namespace cute { - -namespace detail { - -// Empty Structure Optimization -template -struct ESO; - -template -static constexpr bool is_first_empty_v = cute::is_empty::value; -template -static constexpr bool is_rest_empty_v = (cute::is_empty::value && ...); - -template -using ESO_t = ESO, is_rest_empty_v, T...>; - -// Empty First and Empty Rest... -template -struct ESO { - CUTE_HOST_DEVICE constexpr - ESO() {} - - CUTE_HOST_DEVICE constexpr - ESO(First const&, Rest const&...) {} -}; - -// NonEmpty First and Empty Rest... -template -struct ESO { - CUTE_HOST_DEVICE constexpr - ESO() : first_{} {} - - CUTE_HOST_DEVICE constexpr - ESO(First const& first, Rest const&...) : first_{first} {} - - First first_; -}; - -// Empty First and NonEmpty Rest... -template -struct ESO { - CUTE_HOST_DEVICE constexpr - ESO() : rest_{} {} - - CUTE_HOST_DEVICE constexpr - ESO(First const&, Rest const&... rest) : rest_{rest...} {} - - ESO_t rest_; -}; - -// NonEmpty T and NonEmpty Rest... -template -struct ESO { - CUTE_HOST_DEVICE constexpr - ESO() : first_{}, rest_{} {} - - CUTE_HOST_DEVICE constexpr - ESO(First const& first, Rest const&... rest) : first_{first}, rest_{rest...} {} - - First first_; - ESO_t rest_; -}; - -// Get Nth value from ESO -template -CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO const& s) { - if constexpr (N == 0) { - if constexpr (F) { return T{}; } - else { return static_cast(s.first_); } - } else { - if constexpr (R) { return cute::tuple_element_t>{}; } - else { return getv(s.rest_); } - } -} - -template -CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO& s) { - if constexpr (N == 0) { - if constexpr (F) { return T{}; } - else { return static_cast(s.first_); } - } else { - if constexpr (R) { return cute::tuple_element_t>{}; } - else { return getv(s.rest_); } - } -} - -template -CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO&& s) { - if constexpr (N == 0) { - if constexpr (F) { return T{}; } - else { return static_cast(s.first_); } - } else { - if constexpr (R) { return cute::tuple_element_t>{}; } - else { return getv(static_cast&&>(s.rest_)); } - } -} - -// findt: Implementation detail of cute::find. -// If X is the first template argument of the tuple, findt returns C. - -template -CUTE_HOST_DEVICE constexpr -auto -findt(ESO const& t) noexcept -{ - if constexpr (cute::is_same_v) { - return C{}; - } - else { - static_assert(sizeof...(Rest) != 0, - "The type does not appear in the argument list of the tuple."); - if constexpr (IsRestEmpty) { - // The rest is empty, so creating an instance of it is cheap. - return cute::detail::findt(ESO_t{}); - } - else { - return cute::detail::findt(t.rest_); - } - } -} - -} // end namespace detail - -// packed_tuple is a tuple type that is a standard-layout type -// whenever all of its template arguments are standard layout types: -// (cute::is_standard_layout_v && ...) implies (cute::is_standard_layout_v>) - -template -struct packed_tuple : detail::ESO_t -{ - CUTE_HOST_DEVICE constexpr - packed_tuple() {} - - CUTE_HOST_DEVICE constexpr - packed_tuple(T const&... ts) - : detail::ESO_t(ts...) 
- {} -}; - -template <> -struct packed_tuple<> {}; - -template -CUTE_HOST_DEVICE constexpr -decltype(auto) -get(packed_tuple const& t) { - static_assert(I < sizeof...(T), "Index out of range"); - return detail::getv(t); -} - -template -CUTE_HOST_DEVICE constexpr -decltype(auto) -get(packed_tuple& t) { - static_assert(I < sizeof...(T), "Index out of range"); - return detail::getv(t); -} - -template -CUTE_HOST_DEVICE constexpr -decltype(auto) -get(packed_tuple&& t) { - static_assert(I < sizeof...(T), "Index out of range"); - return detail::getv(static_cast&&>(t)); -} - -template -CUTE_HOST_DEVICE constexpr -packed_tuple -make_packed_tuple(T const&... t) -{ - return {t...}; -} - -// Returns the position of type X (as a static integer) in the tuple -// type's argument list. X must be unique in the argument list. -template -CUTE_HOST_DEVICE constexpr -auto -find(packed_tuple const& t) noexcept -{ - return detail::findt(t); -} - -} // end namespace cute - -namespace CUTE_STL_NAMESPACE -{ - -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> - : CUTE_STL_NAMESPACE::tuple_element> -{}; - -} // end namespace CUTE_STL_NAMESPACE - -#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD -namespace std { - -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> - : CUTE_STL_NAMESPACE::tuple_element> -{}; - -} // end namespace std -#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/container/tuple.hpp b/include/cute/container/tuple.hpp index dab8621e..e62cfe16 100644 --- a/include/cute/container/tuple.hpp +++ b/include/cute/container/tuple.hpp @@ -37,169 +37,183 @@ #include #include -#if defined(CUTLASS_USE_PACKED_TUPLE) -# include -#endif - //#include // Advanced optimizations -// cute::tuple is like std::tuple, with two differences. +// cute::tuple is like std::tuple, with differences: // // 1. It works on both host and device. // 2. Its template arguments must be semiregular types. +// 3. It is always a standard-layout type if all of its template arguments are standard-layout types. +// 4. It is always an empty type if all of its template arguments are empty types. // // Semiregular types are default constructible and copyable. // They include "value types" like int or float, // but do _not_ include references like int& or float&. // (See std::tie for an example of a tuple of references.) // -// If the template arguments of cute::tuple are all empty types (in -// the sense of std::is_empty_v), then the cute::tuple is also an -// empty type. Furthermore, if CUTLASS_USE_PACKED_TUPLE is defined, -// cute::tuple is always a standard-layout type if all of its template -// arguments are standard-layout types. - -namespace cute -{ - -#if defined(CUTLASS_USE_PACKED_TUPLE) - -template -using tuple = packed_tuple; - -#else - -namespace detail -{ - -// This is simplified over the implementations in std::, cuda::std::, and thrust:: by ignoring much of +// Standard-layout types preserve ABI across host-device boundaries. +// They are safe to use as device kernel parameters. +// +// The cute::tuple is also simplified over the implementations in std::, cuda::std::, and thrust:: by ignoring much of // the conversion SFINAE, special overloading, and avoiding cvref template types. // // Over standard-conforming tuple implementations, this appears to accelerate compilation times by over 3x. -// EBO stands for "empty base optimization." 
+namespace cute +{ + +namespace detail +{ + +// ESO stands for "empty structure optimization." // We use this technique to ensure that cute::tuple -// doesn't need to waste space storing any template arguments -// of cute::tuple that have no data (like integral_constant). -// Otherwise, cute::tuple would need to spend at least 1 byte -// for each of its template arguments. -// -// This is one way in which cute::tuple differs from std::tuple. +// doesn't waste space storing template arguments that have no data (like integral_constant). // Empty types in the template argument list are not even constructed, -// and do not have unique element addresses. In fact, they are not -// even members of the tuple or stored in any way. Calling `get` +// and do not have unique element addresses. Calling `get` // constructs and returns an instance of an empty type on demand. -// -// EBO always "holds" a single value of type T. -// N is like an array index that TupleBase uses -// to access the desired tuple element. -template ::value> -struct EBO; -template -CUTE_HOST_DEVICE constexpr C findt(EBO const&) -{ return {}; } +template +struct ESO; -// Specialization for types T that have no data; -// the "static tuple leaf." Valid T here include -// integral_constant, Int, -// and any other semiregular type -// for which std::is_empty_v is true. -template -struct EBO -{ +template +static constexpr bool is_first_empty_v = cute::is_empty::value; +template +static constexpr bool is_rest_empty_v = (cute::is_empty::value && ...); + +template +using ESO_t = ESO, is_rest_empty_v, T...>; + +// Empty First and Empty Rest... +template +struct ESO { CUTE_HOST_DEVICE constexpr - EBO() {} + ESO() {} CUTE_HOST_DEVICE constexpr - EBO(T const&) {} + ESO(First const&, Rest const&...) {} }; -template -CUTE_HOST_DEVICE constexpr T getv(EBO const&) -{ return {}; } - -// This is a work around approach to solve a shared memory misalign issue (https://github.com/NVIDIA/cutlass/issues/1250). -// Will remove this work around implementation once the corresponding fix in compiler is released. -struct dummy_EBO_base {}; - -// Specialization for types T that are not empty; -// the "dynamic tuple leaf." Valid T here include int, -// any other integral or floating-point type, -// or any semiregular type for which std::is_empty_v is false. -template -struct EBO : private dummy_EBO_base -{ +// NonEmpty First and Empty Rest... +template +struct ESO { CUTE_HOST_DEVICE constexpr - EBO() : t_{} {} + ESO() : first_{} {} CUTE_HOST_DEVICE constexpr - EBO(T const& t) : t_{t} {} + ESO(First const& first, Rest const&...) : first_{first} {} - T t_; + First first_; }; -template -CUTE_HOST_DEVICE constexpr T const& getv(EBO const& x) -{ return x.t_; } - -template -CUTE_HOST_DEVICE constexpr T& getv(EBO& x) -{ return x.t_; } - -template -CUTE_HOST_DEVICE constexpr T&& getv(EBO&& x) -{ return cute::move(x.t_); } - -template -struct TupleBase; - -// Base class of cute::tuple binds each element to an index -// by inheriting from EBO for each (i, t) in (I..., T...). -// The storage (for nonempty t) lives in the base classes. -template -struct TupleBase, T...> - : EBO... -{ +// Empty First and NonEmpty Rest... +template +struct ESO { CUTE_HOST_DEVICE constexpr - TupleBase() {} + ESO() : rest_{} {} CUTE_HOST_DEVICE constexpr - TupleBase(T const&... t) : EBO(t)... {} + ESO(First const&, Rest const&... rest) : rest_{rest...} {} + + ESO_t rest_; }; +// NonEmpty T and NonEmpty Rest... 
+template +struct ESO { + CUTE_HOST_DEVICE constexpr + ESO() : first_{}, rest_{} {} + + CUTE_HOST_DEVICE constexpr + ESO(First const& first, Rest const&... rest) : first_{first}, rest_{rest...} {} + + First first_; + ESO_t rest_; +}; + +// Get Nth value from ESO +template +CUTE_HOST_DEVICE constexpr +cute::enable_if_t>>::value, + cute::tuple_element_t>> +getv(ESO const&) +{ + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +cute::enable_if_t>>::value, + cute::tuple_element_t> const&> +getv(ESO const& s) +{ + if constexpr (N == 0) { + return static_cast(s.first_); + } else { + return getv(s.rest_); + } +} + +template +CUTE_HOST_DEVICE constexpr +cute::enable_if_t>>::value, + cute::tuple_element_t> &> +getv(ESO& s) +{ + if constexpr (N == 0) { + return static_cast(s.first_); + } else { + return getv(s.rest_); + } +} + +template +CUTE_HOST_DEVICE constexpr +cute::enable_if_t>>::value, + cute::tuple_element_t> &&> +getv(ESO&& s) +{ + if constexpr (N == 0) { + return static_cast(s.first_); + } else { + return getv(static_cast&&>(s.rest_)); + } +} + +template +CUTE_HOST_DEVICE constexpr +auto +findt(ESO const& t) noexcept +{ + if constexpr (cute::is_same_v) { + return C{}; + } else + if constexpr (sizeof...(Rest) == 0) { + return C{}; + } else + if constexpr (IsRestEmpty) { + return cute::detail::findt(ESO_t{}); + } else { + return cute::detail::findt(t.rest_); + } +} + } // end namespace detail -// Attempting to use the following commented-out alias -// in the declaration of `struct tuple` causes MSVC 2022 build errors. -// -//template -//using TupleBase = detail::TupleBase, T...>; - -// This is the actual cute::tuple class. -// The storage (if any) lives in TupleBase's EBO base classes. -// -// Inheriting from the above alias TupleBase -// causes MSVC 2022 build errors when assigning one tuple to another: -// In summary: this is verbose as a work-around for MSVC build errors. template -struct tuple : detail::TupleBase, T...> +struct tuple : detail::ESO_t { CUTE_HOST_DEVICE constexpr tuple() {} CUTE_HOST_DEVICE constexpr - tuple(T const&... t) : detail::TupleBase, T...>(t...) {} + tuple(T const&... t) : detail::ESO_t(t...) {} }; template <> -struct tuple<> -{}; - -// -// get for cute::tuple (just like std::get for std::tuple) -// +struct tuple<> {}; +// Returns the element in the ith position of the tuple template CUTE_HOST_DEVICE constexpr decltype(auto) @@ -224,25 +238,19 @@ decltype(auto) get(tuple&& t) noexcept { static_assert(I < sizeof...(T), "Index out of range"); - return detail::getv(static_cast&&>(t)); + return detail::getv(static_cast&&>(t)); } -// -// find a type X within a cute::tuple -// Requires X to be unique in tuple -// Returns a static integer -// - +// Returns the position of type X (as a static integer) in the tuple +// type's argument list. X must be unique in the argument list. template CUTE_HOST_DEVICE constexpr auto find(tuple const& t) noexcept { - return detail::findt(t); + return detail::findt(t); } -#endif // CUTLASS_USE_PACKED_TUPLE - // // Custom is_tuple trait simply checks the existence of tuple_size // and assumes std::get(.), std::tuple_element @@ -258,7 +266,7 @@ auto has_tuple_size(...) -> false_type; template struct is_tuple : decltype(detail::has_tuple_size((T*)0)) {}; -template +template constexpr bool is_tuple_v = cute::is_tuple::value; // @@ -679,8 +687,6 @@ CUTE_HOST std::ostream& operator<<(std::ostream& os, Tuple const& t) } // end namespace cute -#if ! 
defined(CUTLASS_USE_PACKED_TUPLE) - namespace CUTE_STL_NAMESPACE { @@ -694,22 +700,8 @@ struct tuple_element> : CUTE_STL_NAMESPACE::tuple_element> {}; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> - : CUTE_STL_NAMESPACE::tuple_element> -{}; - } // end namespace CUTE_STL_NAMESPACE -// -// std compatibility -// - #ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD namespace std { @@ -732,17 +724,5 @@ struct tuple_element> : CUTE_STL_NAMESPACE::tuple_element> {}; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> - : CUTE_STL_NAMESPACE::tuple_element> -{}; - } // end namespace std #endif // CUTE_STL_NAMESPACE_IS_CUDA_STD - -#endif // CUTLASS_USE_PACKED_TUPLE diff --git a/include/cute/container/type_list.hpp b/include/cute/container/type_list.hpp index 44001b6d..b8ac5f0d 100644 --- a/include/cute/container/type_list.hpp +++ b/include/cute/container/type_list.hpp @@ -73,17 +73,6 @@ struct tuple_element> using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; }; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> -{ - using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; -}; - } // end namespace std #ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD @@ -109,16 +98,5 @@ struct tuple_element> using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; }; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> -{ - using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; -}; - } // end namespace std #endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/int_tuple.hpp b/include/cute/int_tuple.hpp index a5bef3ec..557e1103 100644 --- a/include/cute/int_tuple.hpp +++ b/include/cute/int_tuple.hpp @@ -330,7 +330,7 @@ ceil_div(IntTupleA const& a, IntTupleB const& b) constexpr int R = tuple_size::value; // Missing ranks in TupleB are implicitly 1 return transform(a, append(b,Int<1>{}), [](auto const& x, auto const& y) { return ceil_div(x,y); }); } else { // tuple int - auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b), + auto [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b), [] (auto const& init, auto const& ai) { return cute::make_tuple(append(get<0>(init), ceil_div(ai, get<1>(init))), ceil_div(get<1>(init), ai)); }); @@ -390,7 +390,7 @@ shape_div(IntTupleA const& a, IntTupleB const& b) static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); return transform(a, b, [](auto const& x, auto const& y) { return shape_div(x,y); }); } else { // tuple int - auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b), + auto [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b), [] (auto const& init, auto const& ai) { return cute::make_tuple(append(get<0>(init), shape_div(ai, get<1>(init))), shape_div(get<1>(init), ai)); }); diff --git a/include/cute/layout.hpp b/include/cute/layout.hpp index c1a275c9..adf460bb 100644 --- a/include/cute/layout.hpp +++ b/include/cute/layout.hpp @@ -1044,7 +1044,7 @@ composition_impl(LShape const& lhs_shape, LStride const& lhs_stride, auto result_shape_0 = take<0,R-1>(lhs_shape); // Mod out the rhs_shape from the lhs_shape - auto const [result_shape_1, rest_shape] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_shape), + auto [result_shape_1, rest_shape] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_shape), [] 
(auto const& init, auto const& si) { return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si))); }); @@ -1058,7 +1058,7 @@ composition_impl(LShape const& lhs_shape, LStride const& lhs_stride, auto result_stride_0 = take<0,R-1>(lhs_stride); // Divide out the rhs_stride from the lhs_shape - auto const [result_shape_1, rest_stride] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_stride), + auto [result_shape_1, rest_stride] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_stride), [] (auto const& init, auto const& di) { return cute::make_tuple(append(get<0>(init), shape_div(di, get<1>(init))), shape_div(get<1>(init), di)); }); @@ -1067,7 +1067,7 @@ composition_impl(LShape const& lhs_shape, LStride const& lhs_stride, auto result_stride_1 = elem_scale(result_stride_0, shape_div(result_shape_0, result_shape_1)); // Mod out the rhs_shape from the lhs_shape - auto const [result_shape_2, rest_shape] = fold(result_shape_1, cute::make_tuple(cute::make_tuple(), rhs_shape), + auto [result_shape_2, rest_shape] = fold(result_shape_1, cute::make_tuple(cute::make_tuple(), rhs_shape), [] (auto const& init, auto const& si) { return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si))); }); diff --git a/include/cute/numeric/arithmetic_tuple.hpp b/include/cute/numeric/arithmetic_tuple.hpp index 32163072..3c2c23cc 100644 --- a/include/cute/numeric/arithmetic_tuple.hpp +++ b/include/cute/numeric/arithmetic_tuple.hpp @@ -508,16 +508,6 @@ struct tuple_element> : CUTE_STL_NAMESPACE::tuple_element> {}; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> - : CUTE_STL_NAMESPACE::tuple_element> -{}; - } // end namespace CUTE_STL_NAMESPACE #ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD @@ -542,15 +532,5 @@ struct tuple_element> : CUTE_STL_NAMESPACE::tuple_element> {}; -template -struct tuple_size> - : CUTE_STL_NAMESPACE::integral_constant -{}; - -template -struct tuple_element> - : CUTE_STL_NAMESPACE::tuple_element> -{}; - } // end namespace std #endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/numeric/int.hpp b/include/cute/numeric/int.hpp index c2e7456e..485c07d5 100644 --- a/include/cute/numeric/int.hpp +++ b/include/cute/numeric/int.hpp @@ -84,7 +84,6 @@ using CUTE_STL_NAMESPACE::uint16_t; using CUTE_STL_NAMESPACE::uint32_t; using CUTE_STL_NAMESPACE::uint64_t; using cutlass::uint128_t; - template struct uint_bit; template <> struct uint_bit< 1> { using type = uint1_t; }; template <> struct uint_bit< 2> { using type = uint2_t; }; @@ -95,7 +94,6 @@ template <> struct uint_bit< 16> { using type = uint16_t; }; template <> struct uint_bit< 32> { using type = uint32_t; }; template <> struct uint_bit< 64> { using type = uint64_t; }; template <> struct uint_bit<128> { using type = cutlass::uint128_t; }; - template using uint_bit_t = typename uint_bit::type; diff --git a/include/cute/tensor_impl.hpp b/include/cute/tensor_impl.hpp index 5218ba37..be22ab37 100644 --- a/include/cute/tensor_impl.hpp +++ b/include/cute/tensor_impl.hpp @@ -235,7 +235,7 @@ struct Tensor decltype(auto) operator()(Coord const& coord) { if constexpr (has_underscore::value) { - auto const& [sliced_layout,offset] = slice_and_offset(coord, layout()); + auto [sliced_layout,offset] = slice_and_offset(coord, layout()); return make_tensor(data() + offset, sliced_layout); } else { return data()[layout()(coord)]; @@ -249,7 +249,7 @@ struct Tensor 
decltype(auto) operator()(Coord const& coord) const { if constexpr (has_underscore::value) { - auto const& [sliced_layout,offset] = slice_and_offset(coord, layout()); + auto [sliced_layout,offset] = slice_and_offset(coord, layout()); return make_tensor(data() + offset, sliced_layout); } else { return data()[layout()(coord)]; diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h index 10b6af8a..8dea5800 100644 --- a/include/cutlass/arch/config.h +++ b/include/cutlass/arch/config.h @@ -102,6 +102,7 @@ #if (!defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM100_ALL)) #define CUTLASS_ARCH_MMA_SM100A_ENABLED 1 #endif + #endif #endif diff --git a/include/cutlass/arch/reg_reconfig.h b/include/cutlass/arch/reg_reconfig.h index 766c2223..707e1d75 100644 --- a/include/cutlass/arch/reg_reconfig.h +++ b/include/cutlass/arch/reg_reconfig.h @@ -38,10 +38,14 @@ #include "cutlass/cutlass.h" #ifndef CUDA_CTA_RECONFIG_ACTIVATED - #if (__CUDACC_VER_MAJOR__ >= 12 && \ - defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)) + #if defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 12 && ( \ + (__CUDA_ARCH__ == 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)) \ + || (__CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL)) \ + ) #define CUDA_CTA_RECONFIG_ACTIVATED 1 #endif + + #endif namespace cutlass { diff --git a/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp b/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp index 74b0e011..27eed799 100644 --- a/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp +++ b/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp @@ -106,7 +106,6 @@ struct CollectiveConv< using ProblemShape = ConvProblemShape; - // TODO: move pipeline mode tiling into the collective setup phase instead static_assert(rank(SmemLayoutA{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)"); static_assert((size<0>(TileShape{}) == size<0>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape."); static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape."); diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h index b5d78ce5..fbef858a 100644 --- a/include/cutlass/conv/conv2d_problem_size.h +++ b/include/cutlass/conv/conv2d_problem_size.h @@ -255,23 +255,27 @@ public: CUTLASS_HOST_DEVICE int64_t activation_size() const { - return (N * H * W * C); + return static_cast(N) * static_cast(H) * + static_cast(W) * static_cast(C); } /// Returns filter size in number of elements CUTLASS_HOST_DEVICE int64_t filter_size() const { - return (K * R * S * C / groups); + return static_cast(K) * static_cast(R) * + static_cast(S) * static_cast(C) / + static_cast(groups); } /// Returns output size in number of elements CUTLASS_HOST_DEVICE int64_t output_size() const { - return (N * P * Q * K); + return static_cast(N) * static_cast(P) * + static_cast(Q) * static_cast(K); } - + /// Returns padding as Tensor4DCoord CUTLASS_HOST_DEVICE cutlass::Tensor4DCoord padding() const { diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h index a7e08361..48bf056e 100644 --- a/include/cutlass/conv/conv3d_problem_size.h +++ b/include/cutlass/conv/conv3d_problem_size.h @@ -285,21 +285,27 @@ public: CUTLASS_HOST_DEVICE int64_t activation_size() const { - return (N * D * H * W * C); + return 
static_cast(N) * static_cast(D) * + static_cast(H) * static_cast(W) * + static_cast(C); } /// Returns filter size in number of elements CUTLASS_HOST_DEVICE int64_t filter_size() const { - return (K * T * R * S * C); + return static_cast(K) * static_cast(T) * + static_cast(R) * static_cast(S) * + static_cast(C); } /// Returns output size in number of elements CUTLASS_HOST_DEVICE int64_t output_size() const { - return (N * Z * P * Q * K); + return static_cast(N) * static_cast(Z) * + static_cast(P) * static_cast(Q) * + static_cast(K); } /// Returns padding as Coord3D diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h index f166afc8..a9aae87b 100644 --- a/include/cutlass/conv/device/implicit_gemm_convolution.h +++ b/include/cutlass/conv/device/implicit_gemm_convolution.h @@ -114,6 +114,33 @@ public: return status; } + // Check that tensor sizes don't exceed maximum supported size + if (kConvolutionalOperator == conv::Operator::kFprop) { + if (args.problem_size.activation_size() * sizeof(ElementA) >= + (1ull << 31) || + args.problem_size.filter_size() * sizeof(ElementB) >= (1ull << 31) || + args.problem_size.output_size() * sizeof(ElementC) >= (1ull << 31)) { + return Status::kErrorInvalidProblem; + } + } + else if (kConvolutionalOperator == conv::Operator::kDgrad || + kConvolutionalOperator == conv::Operator::kDeconv) { + if (args.problem_size.activation_size() * sizeof(ElementC) >= + (1ull << 31) || + args.problem_size.filter_size() * sizeof(ElementB) >= (1ull << 31) || + args.problem_size.output_size() * sizeof(ElementA) >= (1ull << 31)) { + return Status::kErrorInvalidProblem; + } + } + else if (kConvolutionalOperator == conv::Operator::kWgrad) { + if (args.problem_size.activation_size() * sizeof(ElementB) >= + (1ull << 31) || + args.problem_size.filter_size() * sizeof(ElementC) >= (1ull << 31) || + args.problem_size.output_size() * sizeof(ElementA) >= (1ull << 31)) { + return Status::kErrorInvalidProblem; + } + } + // check group conv constraint if (args.problem_size.groups != 1) { if (kGroupMode == conv::GroupMode::kNone) { diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp index c9cd4421..98e77893 100644 --- a/include/cutlass/cuda_host_adapter.hpp +++ b/include/cutlass/cuda_host_adapter.hpp @@ -104,7 +104,7 @@ namespace cutlass { #else // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) -#if (__CUDACC_VER_MAJOR__ >= 13) +#if (__CUDACC_VER_MAJOR__ > 12) #define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \ template \ @@ -142,7 +142,7 @@ namespace cutlass { return reinterpret_cast(pfn)(args...); \ } -#endif // (__CUDACC_VERSION__ >= 12.5) +#endif // (__CUDACC_VER_MAJOR__ > 12) #endif // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) diff --git a/include/cutlass/detail/collective/mixed_input_utils.hpp b/include/cutlass/detail/collective/mixed_input_utils.hpp index c9042351..aed30bee 100644 --- a/include/cutlass/detail/collective/mixed_input_utils.hpp +++ b/include/cutlass/detail/collective/mixed_input_utils.hpp @@ -69,7 +69,7 @@ struct LayoutAwareConvertImpl { auto&& src_vm = cute::recast(src); auto&& dst_vm = cute::recast(dst); CUTLASS_PRAGMA_UNROLL - for (int i = 0; i -constexpr auto -sm100_compute_tile_shape_or_override() { - using namespace cute; - - if constexpr (cute::is_same_v && - cute::is_same_v && - size<1>(CtaTileShape_MNK{}) == 256) { - constexpr int CtaM = size<0>(CtaTileShape_MNK{}); - constexpr int WarpM = size<0>(TmemWarpShape_MN{}); - constexpr int DpFull = 
32; - constexpr int M = cute::min(CtaM, DpFull * WarpM); // target 32dp tmem load - // Note: - // Set Epi_Tile_N to 128 support OverlappingAccum for the largest tile. - // This is a general workable epi_tile_N which does not promise best perf. - return make_tile(Int{}, Int<128>{}); - } - else if constexpr (cute::is_same_v) { - constexpr int CtaM = size<0>(CtaTileShape_MNK{}); - constexpr int CtaN = size<1>(CtaTileShape_MNK{}); - constexpr int WarpM = size<0>(TmemWarpShape_MN{}); - constexpr int WarpN = size<1>(TmemWarpShape_MN{}); - constexpr bool DisableSource = is_void_v; - constexpr int MaxBits = cute::max(sizeof_bits_v, sizeof_bits_v); - - constexpr int DpFull = 32; // tmem datapaths in 1 subpartition - constexpr int M = cute::min(CtaM, DpFull * WarpM); // target 32dp tmem load - constexpr int N_perf = [&]() constexpr { // Known subtile sizes tested for perf - // Epilogues w/o residual load are less sensitive to smem allocation - // Target a fixed amount of compute per epilogue iteration - if (DisableSource) { - if (MaxBits == 4) { - // Make epilogue tile larger to reduce the epilogue iterations. - // 64 is the experimental value. It will minimize epilogue iterations but keep the number of A/B buffers the same. - constexpr int ComputeElts = 8192; - return ComputeElts / M; - } - constexpr int ComputeElts = 4096; - return ComputeElts / M; - } - // Epilogues w/ residual load are more sensitive to smem allocation - // Target optimal smem distribution between epilogue+mainloop based on datatype+tilesize - else { - if (MaxBits == 32) { - return (CtaM > 64 && CtaN <= 128) ? 16 : 32; - } - // Per-column scaling is high register pressure, reduce tile to prevent spills - else if (FusionOp::IsPerColScaleSupported) { - return 32; - } - else if (MaxBits == 16) { - return (CtaN <= 128) ? 32 : 64; - } - else { - return 64; - } - } - }(); - constexpr int N_min_C = (DisableSource || detail::is_m_major()) ? 8 * WarpN - : (sizeof_bits_v == 6) ? 128 * WarpN // TMA store only supports SW128B for FP6 data type - : 128 / sizeof_bits_v * WarpN; - constexpr int N_min_D = (detail::is_m_major()) ? 8 * WarpN - : (sizeof_bits_v == 6) ? 
128 * WarpN // TMA store only supports SW128B for FP6 data type - : 128 / sizeof_bits_v * WarpN; - constexpr int N = cute::min(CtaN, cute::max(N_perf, N_min_C, N_min_D)); - static_assert(CtaN >= N_min_C && CtaN >= N_min_D, "CTA tile too small"); - - // stride by tmem warp layout and return a by-mode tiler - auto tile_m = Layout>{}; - auto tile_n = Layout,Int< WarpN>>, - Stride,Int>>{}; - - return make_tile(tile_m, coalesce(tile_n)); - } - else if constexpr (cute::is_tuple::value) { - EpilogueTileType epi_tile; - constexpr int M = size<0>(shape(epi_tile)); - constexpr int N = size<1>(shape(epi_tile)); - - static_assert(!is_layout::value, "EpilogueTile must be a cute::Tile or cute::Shape"); - static_assert(TmemWarpShape_MN{} == Shape<_2,_2>{} && (M == 32 || M == 64) || - TmemWarpShape_MN{} == Shape<_4,_1>{} && (M == 64 || M == 128), "Unsupported tile shape"); - static_assert(N % 8 == 0, "Unsupported tile shape"); - - return epi_tile; - } - else { - static_assert(cutlass::detail::dependent_false, "Invalid type for EpilogueTileType."); - } -} - -template -static constexpr bool IsPtrArrayDispatchPolicy = - cute::is_same_v || - cute::is_same_v; - - -template < - class CtaTileShape_MNK, - class EpilogueTile_MN, - class ElementC, - class ElementD, - class Schedule -> -constexpr auto -sm100_get_tma_dispatch_policy() { - using EpilogueTileShape_MN = decltype(product_each(shape(EpilogueTile_MN{}))); - constexpr int EpiTiles = size(shape_div(take<0,2>(CtaTileShape_MNK{}), EpilogueTileShape_MN{})); - constexpr int FragmentSize = size(EpilogueTileShape_MN{}) / NumThreadsPerWarpGroup; - // 8b residuals load fast and consume little smem, so the perf cost of waiting on stores to finish outweighs the cost of extra allocation - constexpr bool ReuseSmem = sizeof_bits_v > 8; - constexpr bool DelayTmaStore = false; - constexpr int StagesD = cute::min(EpiTiles, 2); - constexpr int StagesC = ReuseSmem ? cute::max(cute::min(EpiTiles, 4), StagesD+1) - : cute::min(EpiTiles, 4); - - if constexpr (detail::IsPtrArrayDispatchPolicy) { - return Sm100PtrArrayTmaWarpSpecialized{}; - } - else - { - return Sm100TmaWarpSpecialized{}; - } -} - /* * Returns the TMEM_LOAD copy op to be used for the epilogue * Returned TMEM_LOAD op is such that the thread-value ownership matches the widest available @@ -344,10 +208,10 @@ sm100_get_tmem_load_op() { // For complex TF32 kernels else if constexpr (sizeof_bits_v == 64 && sizeof_bits_v == 64) { if constexpr (num_dp == 16) { - return TMEM::op_repeater(); + return TMEM::op_repeater(); } else { - return TMEM::op_repeater(); + return TMEM::op_repeater(); } } // For narrow precision output @@ -376,7 +240,6 @@ sm100_get_smem_store_op() { static_assert(is_m_major || is_n_major, "Unsupported gmem layout"); // Check for TMEM_LOAD layouts that match the thread-value ownership pattern of stmatrix - // TODO: check copy vectorization instead! 
constexpr bool use_stmatrix_m8n8_4x = (sizeof_bits_v == 32 && sizeof_bits_v == 32 && is_n_major && ( cute::is_same_v || @@ -451,22 +314,7 @@ sm100_get_smem_store_op() { } } -template -constexpr auto -sm100_get_register_transform_op() { - using namespace cute; - [[maybe_unused]] constexpr bool is_m_major = cutlass::detail::is_major<0>(GmemStrideTypeD{}); - [[maybe_unused]] constexpr bool is_n_major = cutlass::detail::is_major<1>(GmemStrideTypeD{}); - static_assert(is_m_major || is_n_major, "Unsupported gmem layout"); - - if constexpr (sizeof_bits_v == 4 && is_m_major) { - return SM50_Shuffle_U32_2x2Trans_XOR1{}; - } - else { - return AutoVectorizingCopyWithAssumedAlignment<128>{}; - } -} // Selects the largest vectorized smem load atom available // subject to constraint of gmem layout and chosen TMEM_LOAD's thread-value ownership @@ -503,30 +351,6 @@ sm100_get_smem_load_op() { } } -template -constexpr auto -sm100_get_gmem_load_op() { - if constexpr (detail::is_im2col_mode) { - return SM90_TMA_LOAD_IM2COL{}; - } - else { - - return SM90_TMA_LOAD{}; - } -} - -template -constexpr auto -sm100_get_gmem_store_op() { - if constexpr (detail::is_im2col_mode) { - return SM90_TMA_STORE_IM2COL{}; - } - else { - - return SM90_TMA_STORE{}; - } -} - // aux fusion callbacks builder for sm100 tma epilogue template < int StagesC, @@ -622,9 +446,9 @@ struct CallbacksBuilder< // the fusion operation performed and the dispatch policy to use. template < class OpClass, - class CtaTileShape_MNK, + class MmaTileShape_MNK, + class ClusterShape_MNK, class EpilogueTileType, - class TmemWarpShape_MN, class ElementAccumulator, class ElementCompute, class ElementC_, @@ -637,62 +461,237 @@ template < class FusionOpOrCallbacks > struct Sm100TmaBuilderImpl { +private: + static constexpr bool Is1SmMma = is_base_of_v; + static constexpr bool Is2SmMma = is_base_of_v; + static_assert(Is1SmMma ^ Is2SmMma, "unsupported schedule"); + static_assert(not (Is2SmMma && size<0>(ClusterShape_MNK{}) % 2 == 1), "schedule + cluster mismatch"); + // Passing void C disables source load + smem allocation - using ElementC = cute::conditional_t,ElementD,ElementC_>; // prevents void ref breakages - using GmemLayoutTagC = cute::conditional_t,GmemLayoutTagD,GmemLayoutTagC_>; - - using GmemStrideTypeC = cutlass::detail::TagToStrideC_t; - using GmemStrideTypeD = cutlass::detail::TagToStrideC_t; - - using CopyOpS2G = decltype(detail::sm100_get_gmem_store_op()); - using CopyOpG2S = decltype(detail::sm100_get_gmem_load_op()); - - using FusionOp = conditional_t, - FusionOpOrCallbacks, epilogue::fusion::FusionOperation>; - - using EpilogueTile_MN = decltype(detail::sm100_compute_tile_shape_or_override< - OpClass, CtaTileShape_MNK, EpilogueTileType, TmemWarpShape_MN, - ElementC_, GmemStrideTypeC, ElementD, GmemStrideTypeD, FusionOp>()); - using EpilogueTileShape_MN = decltype(product_each(shape(EpilogueTile_MN{}))); - using EpilogueWarpTileShape_MN = decltype(shape_div(EpilogueTileShape_MN{}, TmemWarpShape_MN{})); - using AccLoadOp = decltype(detail::sm100_get_tmem_load_op< - GmemStrideTypeD, ElementAccumulator, ElementD, EpilogueWarpTileShape_MN, FusionOp>()); + static constexpr bool DisableSource = cute::is_void_v; + using ElementC = cute::conditional_t; // prevents void ref breakages + using GmemLayoutTagC = cute::conditional_t; using InternalSmemElementC = typename cutlass::detail::get_unpacked_element_type::type; using InternalSmemElementD = typename cutlass::detail::get_unpacked_element_type::type; - using DispatchPolicy = 
decltype(detail::sm100_get_tma_dispatch_policy< - CtaTileShape_MNK, EpilogueTile_MN, ElementC_, ElementD, Schedule>()); + using GmemStrideTypeC = cutlass::detail::TagToStrideC_t; + using GmemStrideTypeD = cutlass::detail::TagToStrideC_t; + // TMA builder allows for passing callbacks directly, which is either a fusion::FusionCallbacks // instance or a direct visitor implementation, e.g. fusion::Sm90LinearCombination - using FusionCallbacks = - typename CallbacksBuilder< - DispatchPolicy, - FusionOpOrCallbacks, - CtaTileShape_MNK, - EpilogueTile_MN, - ElementAccumulator, - AccLoadOp - >::Callbacks; + static constexpr bool IsTaggedFusionOp = is_base_of_v; + using FusionOp = conditional_t; + static constexpr auto + cta_tile_shape() { + if constexpr (Is2SmMma) { // 2x1 threadblock shape + auto [mma_tile_m, mma_tile_n, mma_tile_k] = MmaTileShape_MNK{}; + auto cta_tile_m = reverse(shape_div(reverse(mma_tile_m), _2{})); // first MmaTile_M/2 elements, preserve multimode + return make_shape(cta_tile_m, mma_tile_n, mma_tile_k); + } + else { // 1x1 threadblock shape + return MmaTileShape_MNK{}; + } + } + using CtaTileShape_MNK = decltype(cta_tile_shape()); + + static constexpr auto + tmem_warps() { + if constexpr (Is2SmMma && size<0>(MmaTileShape_MNK{}) == 128) { + return Shape<_2,_2>{}; + } + else { + return Shape<_4,_1>{}; + } + } + using TmemWarpShape_MN = decltype(tmem_warps()); + + // Attempts to compute a reasonably performant epilogue tile or allows the user to provide one. + static constexpr auto + epilogue_tile() { + using namespace cute; + + if constexpr (is_same_v && + is_same_v && + size<1>(CtaTileShape_MNK{}) == 256) { + constexpr int CtaM = size<0>(CtaTileShape_MNK{}); + constexpr int WarpM = size<0>(TmemWarpShape_MN{}); + constexpr int DpFull = 32; + constexpr int M = cute::min(CtaM, DpFull * WarpM); // target 32dp tmem load + // Note: + // Set Epi_Tile_N to 128 support OverlappingAccum for the largest tile. + // This is a general workable epi_tile_N which does not promise best perf. + return make_tile(Int{}, Int<128>{}); + } + else if constexpr (is_same_v) { + constexpr int CtaM = size<0>(CtaTileShape_MNK{}); + constexpr int CtaN = size<1>(CtaTileShape_MNK{}); + constexpr int WarpM = size<0>(TmemWarpShape_MN{}); + constexpr int WarpN = size<1>(TmemWarpShape_MN{}); + constexpr int MaxBits = cute::max(sizeof_bits_v, sizeof_bits_v); + + constexpr int DpFull = 32; // tmem datapaths in 1 subpartition + constexpr int M = cute::min(CtaM, DpFull * WarpM); // target 32dp tmem load + constexpr int N_perf = [&]() constexpr { // Known subtile sizes tested for perf + // Epilogues w/o residual load are less sensitive to smem allocation + // Target a fixed amount of compute per epilogue iteration + if (DisableSource) { + if (MaxBits == 4) { + // Make epilogue tile larger to reduce the epilogue iterations. + // 64 is the experimental value. It will minimize epilogue iterations but keep the number of A/B buffers the same. + constexpr int ComputeElts = 8192; + return ComputeElts / M; + } + constexpr int ComputeElts = 4096; + return ComputeElts / M; + } + // Epilogues w/ residual load are more sensitive to smem allocation + // Target optimal smem distribution between epilogue+mainloop based on datatype+tilesize + else { + if (MaxBits == 32) { + return (CtaM > 64 && CtaN <= 128) ? 16 : 32; + } + // Per-column scaling is high register pressure, reduce tile to prevent spills + else if (FusionOp::IsPerColScaleSupported) { + return 32; + } + else if (MaxBits == 16) { + return (CtaN <= 128) ? 
32 : 64; + } + else { + return 64; + } + } + }(); + constexpr int N_min_C = (DisableSource || detail::is_m_major()) ? 8 * WarpN + : (sizeof_bits_v == 6) ? 128 * WarpN // TMA store only supports SW128B for FP6 data type + : 128 / sizeof_bits_v * WarpN; + constexpr int N_min_D = (detail::is_m_major()) ? 8 * WarpN + : (sizeof_bits_v == 6) ? 128 * WarpN // TMA store only supports SW128B for FP6 data type + : 128 / sizeof_bits_v * WarpN; + constexpr int N = cute::min(CtaN, cute::max(N_perf, N_min_C, N_min_D)); + static_assert(CtaN >= N_min_C && CtaN >= N_min_D, "CTA tile too small"); + + // stride by tmem warp layout and return a by-mode tiler + auto tile_m = Layout>{}; + auto tile_n = Layout,Int< WarpN>>, + Stride,Int>>{}; + + return make_tile(tile_m, coalesce(tile_n)); + } + else { + static_assert(cute::is_tuple::value && not is_layout::value, + "EpilogueTile must be a cute::Tile or cute::Shape"); + + EpilogueTileType epi_tile; + constexpr int M = size<0>(shape(epi_tile)); + constexpr int N = size<1>(shape(epi_tile)); + static_assert(N % 8 == 0, "Unsupported tile shape"); + + return epi_tile; + } + } + using EpilogueTile_MN = decltype(epilogue_tile()); + + using EpilogueTileShape_MN = decltype(product_each(shape(EpilogueTile_MN{}))); + static constexpr int EpiTiles = size(shape_div(take<0,2>(CtaTileShape_MNK{}), EpilogueTileShape_MN{})); + static constexpr int FragmentSize = size(EpilogueTileShape_MN{}) / NumThreadsPerWarpGroup; + + using EpilogueWarpTileShape_MN = decltype(shape_div(EpilogueTileShape_MN{}, TmemWarpShape_MN{})); + using AccLoadOp = decltype(detail::sm100_get_tmem_load_op< + GmemStrideTypeD, ElementAccumulator, ElementD, EpilogueWarpTileShape_MN, FusionOp>()); + + static constexpr auto + dispatch_policy() { + // 8b residuals load fast and consume little smem, so the perf cost of waiting on stores to finish outweighs the cost of extra allocation + constexpr bool ReuseSmem = sizeof_bits_v > 8; + // TMA store delay performs worse with residual loads + constexpr bool DelayTmaStore = is_void_v; + + constexpr int StagesD = cute::min(EpiTiles, 2); + constexpr int StagesC = ReuseSmem ? 
cute::max(cute::min(EpiTiles, 4), StagesD+1) + : cute::min(EpiTiles, 4); + + if constexpr (is_same_v || + is_same_v) { + constexpr bool DelayTmaStore_ = false; // TMA store delay complicates tensormap updates for Ptr-Array GEMMs + return Sm100PtrArrayTmaWarpSpecialized{}; + } + else { + return Sm100TmaWarpSpecialized{}; + } + } + + static constexpr auto + fusion_callbacks() { + { + return typename CallbacksBuilder< + decltype(dispatch_policy()), + FusionOpOrCallbacks, + CtaTileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + AccLoadOp + >::Callbacks({},{}); + } + } + + static constexpr auto + gmem_load_op() { + if constexpr (detail::is_im2col_mode) { + return SM90_TMA_LOAD_IM2COL{}; + } + else { + return SM90_TMA_LOAD{}; + } + } + + static constexpr auto + gmem_store_op() { + if constexpr (detail::is_im2col_mode) { + return SM90_TMA_STORE_IM2COL{}; + } + else { + return SM90_TMA_STORE{}; + } + } + + static constexpr auto + register_shuffle_op() { + using namespace cute; + + [[maybe_unused]] constexpr bool is_m_major = cutlass::detail::is_major<0>(GmemStrideTypeD{}); + [[maybe_unused]] constexpr bool is_n_major = cutlass::detail::is_major<1>(GmemStrideTypeD{}); + static_assert(is_m_major || is_n_major, "Unsupported gmem layout"); + + if constexpr (sizeof_bits_v == 4 && is_m_major) { + return SM50_Shuffle_U32_2x2Trans_XOR1{}; + } + else { + return AutoVectorizingCopyWithAssumedAlignment<128>{}; + } + } + +public: using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue< - DispatchPolicy, + decltype(dispatch_policy()), CtaTileShape_MNK, EpilogueTile_MN, ElementC_, // Need to pass void through to expose via GemmUniversal GmemStrideTypeC, ElementD, GmemStrideTypeD, - FusionCallbacks, + decltype(fusion_callbacks()), AccLoadOp, - CopyOpG2S, + decltype(gmem_load_op()), decltype(detail::sm100_get_epilogue_smem_swizzle_layout_atom()), decltype(detail::sm100_get_smem_load_op()), - CopyOpS2G, + decltype(gmem_store_op()), decltype(detail::sm100_get_epilogue_smem_swizzle_layout_atom()), decltype(detail::sm100_get_smem_store_op()), - decltype(detail::sm100_get_register_transform_op()) + decltype(register_shuffle_op()) >; }; @@ -702,7 +701,8 @@ struct Sm100TmaBuilderImpl { // No smem builder template < - class CtaTileShape_MNK, + class OpClass, + class MmaTileShape_MNK, class ClusterShape_MNK, class EpilogueTileType, class ElementAccumulator, @@ -718,8 +718,8 @@ template < > struct CollectiveBuilder< arch::Sm100, - arch::OpClassTensorOp, - CtaTileShape_MNK, + OpClass, + MmaTileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, @@ -732,11 +732,16 @@ struct CollectiveBuilder< AlignmentD, EpilogueScheduleType, FusionOpOrCallbacks, - cute::enable_if_t || - cute::is_same_v >> { + cute::enable_if_t || + is_base_of_v > +> { +private: + static_assert(cute::sizeof_bits_v != 6, "Output element requires TMA"); - static_assert(cute::is_same_v, "Epilogue subtiling requires smem"); - static_assert(cute::sizeof_bits_v != 4 and cute::sizeof_bits_v != 6, "Output element requires smem"); + static constexpr bool Is1SmMma = is_base_of_v; + static constexpr bool Is2SmMma = is_base_of_v; + static_assert(Is1SmMma ^ Is2SmMma, "unsupported schedule"); + static_assert(not (Is2SmMma && size<0>(ClusterShape_MNK{}) % 2 == 1), "schedule + cluster mismatch"); static constexpr bool DisableSource = cute::is_void_v; using ElementC = cute::conditional_t; // prevents void ref breakages @@ -744,173 +749,110 @@ struct CollectiveBuilder< using GmemStrideTypeC = cutlass::detail::TagToStrideC_t; using 
GmemStrideTypeD = cutlass::detail::TagToStrideC_t; - using FusionOp = conditional_t, - FusionOpOrCallbacks, epilogue::fusion::FusionOperation>; + static constexpr bool IsTaggedFusionOp = is_base_of_v; + using FusionOp = conditional_t; - // use a 4x2 division to select tmem load shape in order to maintain compatability with both (4,1) and (2,2) layouts - using EpilogueTile = decltype(take<0,2>(CtaTileShape_MNK{})); - using EpilogueWarpTileShape_MN = decltype(shape_div(EpilogueTile{}, Shape<_4,_2>{})); + static constexpr auto + cta_tile_shape() { + if constexpr (Is2SmMma) { // 2x1 threadblock shape + auto [mma_tile_m, mma_tile_n, mma_tile_k] = MmaTileShape_MNK{}; + auto cta_tile_m = reverse(shape_div(reverse(mma_tile_m), _2{})); // first MmaTile_M/2 elements, preserve multimode + return make_shape(cta_tile_m, mma_tile_n, mma_tile_k); + } + else { // 1x1 threadblock shape + return MmaTileShape_MNK{}; + } + } + using CtaTileShape_MNK = decltype(cta_tile_shape()); + + static constexpr auto + tmem_warps() { + if constexpr (Is2SmMma && size<0>(MmaTileShape_MNK{}) == 128) { + return Shape<_2,_2>{}; + } + else { + return Shape<_4,_1>{}; + } + } + using TmemWarpShape_MN = decltype(tmem_warps()); + + static constexpr auto + epilogue_tile() { + using namespace cute; + if constexpr (not is_same_v) { + static_assert(is_tuple_v, "Shape or Tile"); + return EpilogueTileType{}; + } + else if constexpr (is_same_v) { // perf specialized case + constexpr int EpiM = size<0>(CtaTileShape_MNK{}); + constexpr int EpiN = cute::min(_64{}, size<1>(CtaTileShape_MNK{})); + return Shape, Int>{}; + } + else { + return take<0,2>(CtaTileShape_MNK{}); + } + } + using EpilogueTile = decltype(epilogue_tile()); + + using EpilogueWarpTileShape_MN = decltype(shape_div(EpilogueTile{}, TmemWarpShape_MN{})); using AccLoadOp = decltype(detail::sm100_get_tmem_load_op< GmemStrideTypeD, ElementAccumulator, ElementD, EpilogueWarpTileShape_MN, FusionOp>()); + static constexpr int FragmentSize = size(EpilogueTile{}) / NumThreadsPerWarpGroup; - using DispatchPolicy = cutlass::epilogue::Sm100NoSmemWarpSpecialized; + static constexpr auto + dispatch_policy() { + if constexpr (is_same_v || + is_same_v) { + return Sm100PtrArrayNoSmemWarpSpecialized{}; + } + else { + return Sm100NoSmemWarpSpecialized{}; + } + } + using DispatchPolicy = decltype(dispatch_policy()); - using AlignmentCType = Int; - using AlignmentDType = Int; + static constexpr auto + fusion_callbacks() { + constexpr thread::ScaleType::Kind ScaleType = + DisableSource ? thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default; + if constexpr (IsDefaultFusionOp::value && not is_same_v) { + // Legacy codepath using thread::LinearCombination, do not expect this to be stable + return thread::LinearCombination< + ElementD, 1, ElementAccumulator, ElementCompute, ScaleType, FusionOp::RoundStyle, ElementC>({}); + } + else { + return typename detail::CallbacksBuilder< + DispatchPolicy, + FusionOpOrCallbacks, + CtaTileShape_MNK, + EpilogueTile, + ElementAccumulator, + AccLoadOp + >::Callbacks({},{}); + } + } - static constexpr FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest; - static constexpr thread::ScaleType::Kind ScaleType = DisableSource ? 
- thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default; - - using FusionCallbacks = cute::conditional_t< - IsDefaultFusionOp::value, - // Legacy codepath using thread::LinearCombination, do not expect this to be stable - thread::LinearCombination< - ElementD, 1, ElementAccumulator, ElementCompute, - ScaleType, RoundStyle, ElementC> - , - typename detail::CallbacksBuilder< +public: + using CollectiveOp = + cutlass::epilogue::collective::CollectiveEpilogue< DispatchPolicy, - FusionOpOrCallbacks, - CtaTileShape_MNK, - EpilogueTile, - ElementAccumulator, - AccLoadOp - >::Callbacks - >; - - using CollectiveOp = cute::conditional_t< - cute::is_same_v, - cutlass::epilogue::collective::CollectiveEpilogue< - cutlass::epilogue::Sm100NoSmemWarpSpecialized, EpilogueTile, ElementC_, GmemStrideTypeC, ElementD, GmemStrideTypeD, - FusionCallbacks, + decltype(fusion_callbacks()), AccLoadOp, - AlignmentCType, - AlignmentDType - >, - cutlass::epilogue::collective::CollectiveEpilogue< - cutlass::epilogue::Sm100PtrArrayNoSmemWarpSpecialized, - EpilogueTile, - ElementC_, - GmemStrideTypeC, - ElementD, - GmemStrideTypeD, - FusionCallbacks, - AccLoadOp - > - >; -}; - -// No smem builder for OpClassBlockScaledTensorOp -template < - class CtaTileShape_MNK, - class ClusterShape_MNK, - class EpilogueTileType, - class ElementAccumulator, - class ElementCompute, - class ElementC_, - class GmemLayoutTagC_, - int AlignmentC, - class ElementD, - class GmemLayoutTagD, - int AlignmentD, - class EpilogueScheduleType, - class FusionOp -> -struct CollectiveBuilder< - arch::Sm100, - arch::OpClassBlockScaledTensorOp, - CtaTileShape_MNK, - ClusterShape_MNK, - EpilogueTileType, - ElementAccumulator, - ElementCompute, - ElementC_, - GmemLayoutTagC_, - AlignmentC, - ElementD, - GmemLayoutTagD, - AlignmentD, - EpilogueScheduleType, - FusionOp, - cute::enable_if_t || - cute::is_same_v >> { - - static_assert(cute::sizeof_bits_v != 6, "Output element requires smem"); - - static constexpr bool DisableSource = cute::is_void_v; - using ElementC = cute::conditional_t; // prevents void ref breakages - using GmemLayoutTagC = cute::conditional_t; - static constexpr thread::ScaleType::Kind ScaleType = DisableSource ? 
- thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default; - using GmemStrideTypeC = cutlass::detail::TagToStrideC_t; - using GmemStrideTypeD = cutlass::detail::TagToStrideC_t; - - static_assert(cute::is_tuple::value || cute::is_same_v); - using EpilogueTile = cute::conditional_t, - cute::Shape<_128, _64>, - EpilogueTileType - >; - - using EpilogueWarpTileShape_MN = decltype(shape_div(EpilogueTile{}, Shape<_4,_1>{})); - using AccLoadOp = decltype(detail::sm100_get_tmem_load_op< - GmemStrideTypeD, ElementAccumulator, ElementD, EpilogueWarpTileShape_MN, FusionOp>()); - - using DispatchPolicy = cutlass::epilogue::Sm100NoSmemWarpSpecialized; - - using AlignmentCType = Int; - using AlignmentDType = Int; - - static constexpr FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest; - - static_assert(is_base_of_v, "only support EVT fusions"); - using FusionCallbacks = - typename detail::CallbacksBuilder< - DispatchPolicy, - FusionOp, - CtaTileShape_MNK, - EpilogueTile, - ElementAccumulator, - AccLoadOp - >::Callbacks; - - using CollectiveOp = cute::conditional_t< - cute::is_same_v, - cutlass::epilogue::collective::CollectiveEpilogue< - cutlass::epilogue::Sm100NoSmemWarpSpecialized, - EpilogueTile, - ElementC_, - GmemStrideTypeC, - ElementD, - GmemStrideTypeD, - FusionCallbacks, - AccLoadOp, - AlignmentCType, - AlignmentDType - >, - cutlass::epilogue::collective::CollectiveEpilogue< - cutlass::epilogue::Sm100PtrArrayNoSmemWarpSpecialized, - EpilogueTile, - ElementC_, - GmemStrideTypeC, - ElementD, - GmemStrideTypeD, - FusionCallbacks, - AccLoadOp - > - >; + Int, + Int + >; }; // TMA epilogue builder template < class OpClass, - class CtaTileShape_MNK, // Static CTA tile shape - class ClusterShape_MNK, // Static cluster shape or dynamic (int, int, _1) + class MmaTileShape_MNK, + class ClusterShape_MNK, class EpilogueTileType, class ElementAccumulator, class ElementCompute, @@ -926,7 +868,7 @@ template < struct CollectiveBuilder< arch::Sm100, OpClass, - CtaTileShape_MNK, + MmaTileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, @@ -940,30 +882,20 @@ struct CollectiveBuilder< EpilogueScheduleType, FusionOp, cute::enable_if_t< - // OpClass - ( cute::is_same_v - || cute::is_same_v - ) && - // Epilogue Schedule Type - ( cute::is_base_of_v || - cute::is_base_of_v - || detail::IsPtrArrayDispatchPolicy - )>> + // Only support TensorOp kernels + not cute::is_same_v && + (cute::is_base_of_v || + cute::is_base_of_v) + > +> { -private: - using TmemWarpShape_MN = cute::conditional_t(CtaTileShape_MNK{}) == 64 && - (cute::is_base_of_v - || cute::is_same_v - ), - Shape<_2,_2>, Shape<_4,_1>>; - public: using CollectiveOp = typename detail::Sm100TmaBuilderImpl< OpClass, - CtaTileShape_MNK, + MmaTileShape_MNK, + ClusterShape_MNK, EpilogueTileType, - TmemWarpShape_MN, ElementAccumulator, ElementCompute, ElementC, @@ -977,11 +909,11 @@ public: >::CollectiveOp; }; -// Auto builder +// Auto epilogue builder for TensorOp kernels template < class OpClass, - class CtaTileShape_MNK, // Static CTA tile shape - class ClusterShape_MNK, // Static cluster shape or dynamic (int, int, _1) + class MmaTileShape_MNK, + class ClusterShape_MNK, class EpilogueTileType, class ElementAccumulator, class ElementCompute, @@ -991,13 +923,12 @@ template < class ElementD, class GmemLayoutTagD, int AlignmentD, - class EpilogueScheduleType, class FusionOp > struct CollectiveBuilder< arch::Sm100, OpClass, - CtaTileShape_MNK, + MmaTileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, @@ -1008,30 
+939,41 @@ struct CollectiveBuilder< ElementD, GmemLayoutTagD, AlignmentD, - EpilogueScheduleType, + EpilogueScheduleAuto, FusionOp, - cute::enable_if_t< - // OpClass - ( cute::is_same_v - || cute::is_same_v - ) - // Epilogue Schedule Type - && cute::is_same_v> + // only for TensorOp kernels + cute::enable_if_t> > { private: - static_assert(cute::is_same_v, "Don't specify epilogue tile with auto schedule"); - using TmemWarpShape_MN = cute::conditional_t(CtaTileShape_MNK{}) == 64 && - size<0>(ClusterShape_MNK{}) % 2 == 0 - , - Shape<_2,_2>, Shape<_4,_1>>; + static constexpr bool + is_2sm() { + using namespace cute; + constexpr int MmaTileM = size<0>(MmaTileShape_MNK{}); + constexpr int ClusterM = size<0>(ClusterShape_MNK{}); + constexpr bool StaticClusterM = is_static_v(ClusterShape_MNK{}))>; + constexpr bool EvenClusterM = StaticClusterM && ClusterM % 2 == 0; + if constexpr (not EvenClusterM) { + return false; + } + else if constexpr (is_same_v) { + return MmaTileM == 256; + } + else { + return MmaTileM == 256 || MmaTileM == 128; + } + } + using EpilogueSchedule = cute::conditional_t; + public: + static_assert(cute::is_same_v, "Don't specify epilogue tile with auto schedule"); using CollectiveOp = - typename detail::Sm100TmaBuilderImpl< + typename CollectiveBuilder< + arch::Sm100, OpClass, - CtaTileShape_MNK, + MmaTileShape_MNK, + ClusterShape_MNK, EpilogueTileType, - TmemWarpShape_MN, ElementAccumulator, ElementCompute, ElementC, @@ -1040,7 +982,7 @@ public: ElementD, GmemLayoutTagD, AlignmentD, - EpilogueScheduleType, + EpilogueSchedule, FusionOp >::CollectiveOp; }; diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_array_nosmem.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_array_nosmem.hpp index c1b06b06..80eea5e2 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_array_nosmem.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_array_nosmem.hpp @@ -356,24 +356,21 @@ public: } // Represent the full output tensor, slice to get the tile this CTA is responsible for - Tensor mC = make_tensor(make_gmem_ptr(ptr_C_l), problem_shape_mnl, append<3>(params.dC,_0{})); // (M,N,L) - Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), problem_shape_mnl, append<3>(params.dD,_0{})); // (M,N,L) - Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) - Tensor gD = local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) - Tensor gC_epi = flat_divide( gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor gD_epi = flat_divide( gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor mC = make_tensor(make_gmem_ptr(ptr_C_l), problem_shape_mnl, append<3>(params.dC,_0{})); // (M,N,L) + Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), problem_shape_mnl, append<3>(params.dD,_0{})); // (M,N,L) + Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + Tensor gD = local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) // Partition source and destination tiles according to tmem copy T2R partitioning (tTR_) auto thread_t2r = tiled_t2r.get_slice(threadIdx.x % size(tiled_t2r)); - Tensor tTR_gC = thread_t2r.partition_D(gC_epi); // (T2R,T2R_M,T2R_N) - Tensor tTR_gD = thread_t2r.partition_D(gD_epi); // (T2R,T2R_M,T2R_N) + Tensor tTR_gC = thread_t2r.partition_D(gC); // (T2R,T2R_M,T2R_N) + Tensor tTR_gD = thread_t2r.partition_D(gD); // (T2R,T2R_M,T2R_N) - Tensor coordD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) - Tensor cD = local_tile(coordD, cta_tiler, cta_coord_mnl); 
// (CTA_M,CTA_N) -> (m,n,l) - Tensor cD_epi = flat_divide( cD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor tTR_cD = thread_t2r.partition_D(cD); // (T2R,T2R_M,T2R_N) -> (m,n,l) + Tensor coordD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cD = local_tile(coordD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor tTR_cD = thread_t2r.partition_D(cD); // (T2R,T2R_M,T2R_N) -> (m,n,l) // 2. Apply element-wise operation and store to gmem // source is needed @@ -410,7 +407,9 @@ template < class ElementD_, class StrideD_, class ThreadEpilogueOp_, - class CopyOpT2R_ + class CopyOpT2R_, + class AlignmentC, + class AlignmentD > class CollectiveEpilogue< Sm100PtrArrayNoSmemWarpSpecialized, @@ -420,7 +419,9 @@ class CollectiveEpilogue< ElementD_, StrideD_, ThreadEpilogueOp_, - CopyOpT2R_ + CopyOpT2R_, + AlignmentC, + AlignmentD > : public detail::Sm100TmaWarpSpecializedAdapter(cta_tile_shape_mnk); // Represent the full output tensor, slice to get the tile this CTA is responsible for - Tensor mC = make_tensor(make_gmem_ptr(params.ptr_C), problem_shape_mnl, append<3>(params.dC,_0{})); // (M,N,L) - Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D), problem_shape_mnl, append<3>(params.dD,_0{})); // (M,N,L) - Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) - Tensor gD = local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) - Tensor gC_epi = flat_divide( gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor gD_epi = flat_divide( gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor mC = make_tensor(make_gmem_ptr(params.ptr_C), problem_shape_mnl, append<3>(params.dC,_0{})); // (M,N,L) + Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D), problem_shape_mnl, append<3>(params.dD,_0{})); // (M,N,L) + Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + Tensor gD = local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) // Partition source and destination tiles according to tmem copy T2R partitioning (tTR_) auto thread_t2r = tiled_t2r.get_slice(threadIdx.x % size(tiled_t2r)); - Tensor tTR_gC = thread_t2r.partition_D(gC_epi); // (T2R,T2R_M,T2R_N) - Tensor tTR_gD = thread_t2r.partition_D(gD_epi); // (T2R,T2R_M,T2R_N) + Tensor tTR_gC = thread_t2r.partition_D(gC); // (T2R,T2R_M,T2R_N) + Tensor tTR_gD = thread_t2r.partition_D(gD); // (T2R,T2R_M,T2R_N) - Tensor coordCD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) - Tensor cCD = local_tile(coordCD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) - Tensor cD_epi = flat_divide( cCD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor tTR_cCD = thread_t2r.partition_D(cCD); // (T2R,T2R_M,T2R_N) -> (m,n,l) + Tensor coordCD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cCD = local_tile(coordCD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor tTR_cCD = thread_t2r.partition_D(cCD); // (T2R,T2R_M,T2R_N) -> (m,n,l) // 2. 
Apply element-wise operation and store to gmem ThreadEpilogueOp epilogue_op{params.thread}; @@ -587,18 +584,18 @@ public: int thread_idx = threadIdx.x % ThreadCount; - Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); // (CTA_M,CTA_N) - Tensor tAcc_epi = flat_divide(tAcc, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); // (CTA_M,CTA_N) + Tensor tAcc_epi = flat_divide(tAcc, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) TiledCopy tiled_t2r = make_tmem_copy(CopyOpT2R{}, tAcc_epi(_,_,_0{},_0{})); ThrCopy thread_t2r = tiled_t2r.get_slice(thread_idx); - Tensor tTR_tAcc = thread_t2r.partition_S(tAcc_epi); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc_epi); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) constexpr int FragmentSize = size(EpilogueTile{}) / ThreadCount; - Tensor coordD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) - Tensor cD = local_tile(coordD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor coordD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cD = local_tile(coordD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) Tensor cD_epi = flat_divide(cD, EpilogueTile{}); - Tensor tTR_cD = thread_t2r.partition_D(cD_epi); // (T2R,T2R_M,T2R_N) -> (m,n,l) + Tensor tTR_cD = thread_t2r.partition_D(cD_epi); // (T2R,T2R_M,T2R_N) -> (m,n,l) Tensor tTR_rAcc = make_tensor(shape(tTR_cD(_,_,_,_0{},_0{}))); @@ -689,19 +686,22 @@ public: do_acc_release = iter_m == size<3>(tTR_tAcc)-1 && iter_n == 0; } - Tensor tTR_cCD_mn = tTR_cCD(_,_,_,epi_m,epi_n); + Tensor tTR_cCD_mn = tTR_cCD(_,_,_,epi_m,epi_n); cst_callbacks.begin_loop(epi_m, epi_n); - if (is_C_load_needed) { - Tensor tTR_cC_frag = tensor<1>(zipped_divide(coalesce(tTR_cCD_mn), mclC.compose(Int{}))); - Tensor tTR_gC_frg = recast>(coalesce(tTR_gC(_,_,_,epi_m,epi_n))); - Tensor tTR_rC_frg = recast>(coalesce(tCrC)); + if constexpr (not cute::is_void_v) { + if (is_C_load_needed) { + using CVecType = uint_bit_t>; + Tensor tTR_cC_frag = tensor<1>(zipped_divide(coalesce(tTR_cCD_mn), mclC.compose(Int{}))); - auto pred_fn_C = [&] (auto const&... coords) { - return elem_less(tTR_cC_frag(coords...), problem_shape_mnl); - }; + auto pred_fn_C = [&] (auto const&... coords) CUTLASS_LAMBDA_FUNC_INLINE { + return elem_less(tTR_cC_frag(coords...), problem_shape_mnl); + }; - copy_if(pred_fn_C, tTR_gC_frg, tTR_rC_frg); + Tensor tTR_gC_frg = recast(coalesce(tTR_gC(_,_,_,epi_m,epi_n))); + Tensor tTR_rC_frg = recast(coalesce(tCrC)); + copy_if(pred_fn_C, tTR_gC_frg, tTR_rC_frg); + } } // Copy accumulator tile from tmem to register @@ -733,17 +733,15 @@ public: Tensor tTR_cD_frag = tensor<1>(zipped_divide(coalesce(tTR_cCD_mn), mclD.compose(Int{}))); - - using VecType = uint_bit_t>; - Tensor tTR_gD_frg = recast(coalesce(tTR_gD(_,_,_,epi_m,epi_n))); - Tensor tTR_rD_frg = recast(coalesce(tTR_rD)); - auto pred_fn_D = [&] (auto const&... 
coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tTR_cD_frag(coords...), problem_shape_mnl); }; - copy_if(pred_fn_D, tTR_rD_frg, tTR_gD_frg); + using VecType = uint_bit_t>; + Tensor tTR_gD_frg = recast(coalesce(tTR_gD(_,_,_,epi_m,epi_n))); + Tensor tTR_rD_frg = recast(coalesce(tTR_rD)); + copy_if(pred_fn_D, tTR_rD_frg, tTR_gD_frg); } // for epi_m } // for epi_n diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp index f8c5b287..c3893675 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp @@ -340,7 +340,7 @@ public: _1{}); } - typename Params::TMA_D tma_store_d; + typename Params::TMA_D tma_store_d{}; if constexpr (is_destination_supported) { ElementD const* ptr_D_first_batch = reinterpret_cast(args.ptr_D); Tensor tensor_d = make_tensor(ptr_D_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_d, _0{}))); diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp index f13a6b6f..83302627 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -287,7 +287,7 @@ public: EpilogueTile{}); } - typename Params::TMA_D tma_store_d; + typename Params::TMA_D tma_store_d{}; if constexpr (is_destination_supported) { Tensor tensor_d = make_tensor(make_gmem_ptr(args.ptr_D), make_layout(make_shape(M,N,L), args.dD)); tma_store_d = make_tma_copy_C_sm90( diff --git a/include/cutlass/epilogue/dispatch_policy.hpp b/include/cutlass/epilogue/dispatch_policy.hpp index be1ff675..bd083c80 100644 --- a/include/cutlass/epilogue/dispatch_policy.hpp +++ b/include/cutlass/epilogue/dispatch_policy.hpp @@ -44,35 +44,30 @@ namespace cutlass::epilogue { // Builder Epilogue Schedules // ////////////////////////////////////////////////////////////////////////////// - +// Pre-Hopper schedules struct PtrArrayDefault {}; struct EpilogueSimtVectorized {}; struct EpiloguePtrArraySimtVectorized {}; +// Hopper direct store schedules struct NoSmemWarpSpecialized {}; struct PtrArrayNoSmemWarpSpecialized {}; struct PtrArrayNoSmemWarpSpecializedTransposed {}; +// Hopper TMA schedules struct TmaWarpSpecialized {}; struct TmaWarpSpecializedCooperative {}; - +struct PtrArrayTmaWarpSpecialized { static constexpr int NumEpilogueWarpGroups = 1; }; +struct PtrArrayTmaWarpSpecializedPingpong { static constexpr int NumEpilogueWarpGroups = 2; }; +struct PtrArrayTmaWarpSpecializedCooperative { static constexpr int NumEpilogueWarpGroups = 2; }; +// Blackwell direct store schedules +struct NoSmemWarpSpecialized1Sm {}; +struct NoSmemWarpSpecialized2Sm {}; +struct PtrArrayNoSmemWarpSpecialized1Sm : NoSmemWarpSpecialized1Sm {}; +struct PtrArrayNoSmemWarpSpecialized2Sm : NoSmemWarpSpecialized2Sm {}; +// Blackwell TMA schedules struct TmaWarpSpecialized1Sm {}; struct TmaWarpSpecialized2Sm {}; -struct PtrArrayTmaWarpSpecialized1Sm {}; -struct PtrArrayTmaWarpSpecialized2Sm {}; - -struct PtrArrayTmaWarpSpecializedCooperative { - static constexpr int NumEpilogueWarpGroups = 2; -}; - -// Standard warp specialized epilogue -struct PtrArrayTmaWarpSpecialized { - static constexpr int NumEpilogueWarpGroups = 1; -}; - -// Pingpong kernel epilogue -struct 
PtrArrayTmaWarpSpecializedPingpong { - static constexpr int NumEpilogueWarpGroups = 2; -}; - +struct PtrArrayTmaWarpSpecialized1Sm : TmaWarpSpecialized1Sm {}; +struct PtrArrayTmaWarpSpecialized2Sm : TmaWarpSpecialized2Sm {}; // DEPRECATED schedules, will be removed in next release struct TmaWarpSpecializedElementwiseBase : public TmaWarpSpecialized {}; struct TmaWarpSpecializedCooperativeElementwiseBase : public TmaWarpSpecializedCooperative {}; diff --git a/include/cutlass/epilogue/fusion/operations.hpp b/include/cutlass/epilogue/fusion/operations.hpp index fcd9fc56..8cac28f7 100644 --- a/include/cutlass/epilogue/fusion/operations.hpp +++ b/include/cutlass/epilogue/fusion/operations.hpp @@ -53,6 +53,7 @@ struct FusionOperation { // metadata types/queries that can be overrided using ElementOutput = void; using ElementCompute = void; + FloatRoundStyle RoundStyle = FloatRoundStyle::round_indeterminate; using ElementSource = void; static constexpr bool IsSourceSupported = false; diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h index ad8f5651..7abed263 100644 --- a/include/cutlass/epilogue/thread/linear_combination_clamp.h +++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h @@ -482,7 +482,6 @@ public: /// Note: The below method only when problem_size_K <= 256 for signed int8 gemm /// or problem_size_K <= 128 for unsigned int8 gemm. The default approach is /// above. -/// TODO: Add logic to fallback to the default approach template < /// Data type used to load and store< tensors typename ElementOutput_, diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h index a725a889..279c3aa6 100644 --- a/include/cutlass/fast_math.h +++ b/include/cutlass/fast_math.h @@ -39,13 +39,8 @@ #include #endif #if !defined(__QNX__) -#include -#if defined(_MSC_VER) && defined(CCCL_VERSION) && CCCL_VERSION >= 2008000 -#include -#else #include #endif -#endif #include "cutlass/cutlass.h" #include "cutlass/array.h" #include "cutlass/uint128.h" diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 5d3d6fca..ecbcdff2 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -51,18 +51,57 @@ #ifdef _MSC_VER // Provides support for alternate operators such as 'and', 'or', ... 
#include +#include #endif // _MSC_VER - #if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) # define CUTLASS_ARCH_CREDUX_ENABLED #endif - namespace cutlass { ///////////////////////////////////////////////////////////////////////////////////////////////// +namespace detail { + + CUTLASS_HOST_DEVICE int32_t popcount(int32_t x) { + #if defined(__CUDA_ARCH__) + return __popc(x); + #elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcount(x); + #elif defined(_MSC_VER) + return __popcnt(x); + #else + int32_t count = 0; + while (x) { + count += x & 1; + x >>= 1; + } + return count; + #endif + } + + CUTLASS_HOST_DEVICE int64_t popcount(int64_t x) { + #if defined(__CUDA_ARCH__) + return __popcll(x); + #elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcountll(x); + #elif defined(_MSC_VER) + return __popcnt64(x); + #else + int64_t count = 0; + while (x) { + count += x & 1; + x >>= 1; + } + return count; + #endif + } + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + template struct absolute_value_op { CUTLASS_HOST_DEVICE @@ -609,22 +648,7 @@ struct and_popc_add { CUTLASS_HOST_DEVICE C operator()(A const &a, B const &b, C const &c) const { A and_result = a & b; - -#if defined(__CUDA__ARCH__) - int popc_result = __popc(and_result); - - if constexpr (sizeof(A) == sizeof(uint64_t)) { - popc_result += __popc(static_cast(and_result >> 32)); - } - -#else - int popc_result = __builtin_popcount(and_result); - if constexpr (sizeof(A) == sizeof(uint64_t)) { - popc_result += __builtin_popcount(static_cast(and_result >> 32)); - } - -#endif - + int32_t popc_result = detail::popcount(and_result); return C(popc_result) + c; } }; @@ -646,22 +670,7 @@ struct xor_popc_add { CUTLASS_HOST_DEVICE C operator()(A const &a, B const &b, C const &c) const { A xor_result = a ^ b; - -#if defined(__CUDA__ARCH__) - int popc_result = __popc(xor_result); - - if constexpr (sizeof(A) == sizeof(uint64_t)) { - popc_result += __popc(static_cast(xor_result >> 32)); - } - -#else - int popc_result = __builtin_popcount(xor_result); - if constexpr (sizeof(A) == sizeof(uint64_t)) { - popc_result += __builtin_popcount(static_cast(xor_result >> 32)); - } - -#endif - + int32_t popc_result = detail::popcount(xor_result); return C(popc_result) + c; } }; @@ -682,22 +691,7 @@ struct or_popc_add { CUTLASS_HOST_DEVICE C operator()(A const &a, B const &b, C const &c) const { A or_result = a | b; - -#if defined(__CUDA__ARCH__) - int popc_result = __popc(or_result); - - if constexpr (sizeof(A) == sizeof(uint64_t)) { - popc_result += __popc(static_cast(or_result >> 32)); - } - -#else - int popc_result = __builtin_popcount(or_result); - if constexpr (sizeof(A) == sizeof(uint64_t)) { - popc_result += __builtin_popcount(static_cast(or_result >> 32)); - } - -#endif - + int32_t popc_result = detail::popcount(or_result); return C(popc_result) + c; } }; diff --git a/include/cutlass/gemm/collective/builders/sm100_common.inl b/include/cutlass/gemm/collective/builders/sm100_common.inl index 8e53866a..9f2542b5 100644 --- a/include/cutlass/gemm/collective/builders/sm100_common.inl +++ b/include/cutlass/gemm/collective/builders/sm100_common.inl @@ -567,7 +567,7 @@ sm100_make_trivial_fastFP32_tiled_mma() { } /** - * @brief Check for U4_UNPACK_U8, U6_UNPACK_U8 alignment requirement + * @brief Check for F8F6F4 alignment requirement * * @tparam TileShape_MNK (MmaAtomShape_M, MmaAtomShape_N, TileShape_K) * @tparam ClusterShape_MNK (cluster_M, cluster_N, cluster_K) diff 
--git a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl index 4209fd87..7736dbee 100644 --- a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl +++ b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl @@ -85,7 +85,7 @@ compute_stage_count_or_override(StageCountAutoCarveout stage_co } // Returns the maximum number of smem tiles that can be used with a given smem capacity in gemm of blockwise/groupwise scale. -template +template constexpr int compute_stage_count_with_blockwise_scale(StageCountAutoCarveout stage_count) { constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage); @@ -96,7 +96,7 @@ compute_stage_count_with_blockwise_scale(StageCountAutoCarveout cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) + cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) + cutlass::bits_to_bytes(scale_bits * ScaleMsPerTile) + // scale of tensor A - cutlass::bits_to_bytes(scale_bits * 1); // scale of tensor B + cutlass::bits_to_bytes(scale_bits * ScaleNsPerTile); // scale of tensor B constexpr int stage_bytes = cutlass::round_up(stage_bytes_, alignment) + static_cast(mainloop_pipeline_bytes); @@ -1043,7 +1043,8 @@ template < class TileShape_MNK, class ClusterShape_MNK, class StageCountType, - int ScaleGranularityM_ + int ScaleGranularityM_, + int ScaleGranularityN_ > struct CollectiveBuilder< arch::Sm90, @@ -1058,11 +1059,11 @@ struct CollectiveBuilder< TileShape_MNK, ClusterShape_MNK, StageCountType, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum, cute::enable_if_t< not detail::is_use_rmem_A()> > { - using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum; + using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum; static_assert(is_static::value); static_assert(is_static::value); @@ -1090,7 +1091,7 @@ struct CollectiveBuilder< static constexpr bool IsCooperative = cute::is_any_of_v>; + KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum>; using AtomLayoutMNK = cute::conditional_t>, Layout>>; @@ -1109,12 +1110,15 @@ struct CollectiveBuilder< static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape_MNK{}) : ScaleGranularityM_; + static constexpr int ScaleGranularityN = ScaleGranularityN_ == 0 ? 
size<1>(TileShape_MNK{}) : ScaleGranularityN_; static constexpr int ScaleMsPerTile = size<0>(TileShape_MNK{}) / ScaleGranularityM; + static constexpr int ScaleNsPerTile = size<1>(TileShape_MNK{}) / ScaleGranularityN; static_assert((size<0>(TileShape_MNK{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); + static_assert((size<1>(TileShape_MNK{}) % ScaleGranularityN) == 0, "FP8 scaling granularity must evenly divide tile shape along N."); static constexpr int PipelineStages = detail::compute_stage_count_with_blockwise_scale(StageCountType{}); - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8; + ElementAMma, ElementBMma, ElementBlockScale, TileShape_MNK, ScaleMsPerTile, ScaleNsPerTile>(StageCountType{}); + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8; using SmemCopyAtomA = void; using SmemCopyAtomB = void; diff --git a/include/cutlass/gemm/collective/fp8_accumulation.hpp b/include/cutlass/gemm/collective/fp8_accumulation.hpp index bd2a0cb2..9dff91a5 100644 --- a/include/cutlass/gemm/collective/fp8_accumulation.hpp +++ b/include/cutlass/gemm/collective/fp8_accumulation.hpp @@ -75,6 +75,15 @@ private: } // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). + CUTLASS_DEVICE + void scale_core(ElementAccumulator const &scale) { + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i) * scale; + } + } + template < class EngineScale, class LayoutScale> @@ -94,6 +103,31 @@ private: } } + template < + class EngineScaleA, + class LayoutScaleA, + class EngineScaleB, + class LayoutScaleB> + CUTLASS_DEVICE + void scale_core(const cute::Tensor &scaleA, const cute::Tensor &scaleB) { + using TensorScaleA = cute::Tensor; + using TensorScaleB = cute::Tensor; + + static_assert(is_static::value, "ScaleA Layout should be static"); + static_assert(is_static::value, "ScaleB Layout should be static"); + static_assert(is_rmem::value, "ScaleA tensor must be rmem resident."); + static_assert(is_rmem::value, "ScaleB tensor must be rmem resident."); + + static_assert(LayoutAccum{}.shape() == LayoutScaleA{}.shape(), "Accumulator and scaleA must have same shape."); + static_assert(LayoutAccum{}.shape() == LayoutScaleB{}.shape(), "Accumulator and scaleB must have same shape."); + + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i) * scaleA(i) * scaleB(i); + } + } + public: CUTLASS_DEVICE GmmaFP8Accumulation( @@ -152,6 +186,16 @@ public: // /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. 
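// Illustrative host-side sketch, not part of this patch: it models the promotion-interval
// bookkeeping that GmmaFP8Accumulation's scale_core()/scale_if_needed() implement in this hunk.
// The names promote_if_needed/main_acc/partial_acc/promotion_interval are hypothetical; the real
// class tracks mma_count_ and folds the temporary accumulators into the main FP32 accumulator
// with the A and B block scales once the interval is reached.
inline void promote_if_needed(float* main_acc, float* partial_acc, int count,
                              float scale_a, float scale_b,
                              int& mma_count, int promotion_interval) {
  if (mma_count == promotion_interval) {                  // same trigger condition as scale_if_needed()
    for (int i = 0; i < count; ++i) {
      main_acc[i] += partial_acc[i] * scale_a * scale_b;  // mirrors scale_core(scaleA, scaleB)
      partial_acc[i] = 0.0f;                              // partial sums restart for the next interval
    }
    mma_count = 0;
  }
}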
+ CUTLASS_DEVICE + void scale_if_needed(ElementAccumulator const &scale) { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + scale_core(scale); + mma_count_ = 0; + } + } + template < class EngineScale, class LayoutScale> @@ -165,7 +209,29 @@ public: } } + template < + class EngineScaleA, + class LayoutScaleA, + class EngineScaleB, + class LayoutScaleB> + CUTLASS_DEVICE + void scale_if_needed(const cute::Tensor &scaleA, const cute::Tensor &scaleB) { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + scale_core(scaleA, scaleB); + mma_count_ = 0; + } + } + /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void scale_residue_if_needed(ElementAccumulator const &scale) { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + scale_core(scale); + } + } + template < class EngineScale, class LayoutScale> @@ -175,6 +241,18 @@ public: scale_core(scale); } } + + template < + class EngineScaleA, + class LayoutScaleA, + class EngineScaleB, + class LayoutScaleB> + CUTLASS_DEVICE + void scale_residue_if_needed(const cute::Tensor &scaleA, const cute::Tensor &scaleB) { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + scale_core(scaleA, scaleB); + } + } }; } // namespace cutlass::gemm::collective diff --git a/include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp b/include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp index 65718878..fec954a5 100644 --- a/include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp @@ -30,8 +30,6 @@ **************************************************************************************************/ - - #pragma once #include "cutlass/cutlass.h" @@ -288,23 +286,23 @@ struct CollectiveMma< using TensorStorage = typename SharedStorage::TensorStorage; using PipelineStorage = typename SharedStorage::PipelineStorage; + // Only one thread issues the TMA and updates the barriers in a 2SM MMA, adjust bytes accordingly static constexpr uint32_t SFTransactionBytes = cutlass::bits_to_bytes(size(AtomThrShapeMNK{}) * cosize(take<0,3>(SmemLayoutSFA{})) * cute::sizeof_bits_v) + cutlass::bits_to_bytes(size(AtomThrShapeMNK{}) * cosize(take<0,3>(SmemLayoutSFB{})) * cute::sizeof_bits_v); - // Only one thread issues the TMA and updates the barriers in a 2SM MMA, adjust bytes accordingly static constexpr uint32_t ABTmaTransactionBytes = cutlass::bits_to_bytes(size(AtomThrShapeMNK{}) * cosize(take<0,3>(SmemLayoutA{})) * cute::sizeof_bits_v) + cutlass::bits_to_bytes(size(AtomThrShapeMNK{}) * cosize(take<0,3>(SmemLayoutB{})) * cute::sizeof_bits_v); static constexpr uint32_t TmaTransactionBytes = ABTmaTransactionBytes + SFTransactionBytes; - template + template struct TmemStorage { AccTensor accumulators; SfaTensor tCtSFA; SfbTensor tCtSFB; }; - template< + template < class KTileCount, class GTensorPartitionedA, class GTensorPartitionedB, class STensorA, class STensorB, @@ -348,7 +346,8 @@ struct CollectiveMma< , mcast_mask_sfa(mcast_mask_sfa_), mcast_mask_sfb(mcast_mask_sfb_) {} }; - template< + template < + class TiledMma, class FragmentA, class FragmentB, class FragmentSFA, class FragmentSFB, class SFATiledCopy, class SmemFrgSFA, class TmemFrgSFA, @@ -496,6 
+495,7 @@ struct CollectiveMma< Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape); + // Cluster layout for TMA construction auto cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape), make_tile(typename TiledMma::AtomThrID{})); auto cluster_shape_fallback = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape_fallback); @@ -505,7 +505,7 @@ struct CollectiveMma< // Cluster layout for TMA construction of SFB auto cluster_layout_sfb_vmnk = tiled_divide(make_layout(cluster_shape), make_tile(typename TiledMMA_SF::AtomThrID{})); - auto cluster_layout_sfb_vmnk_fallback = tiled_divide(make_layout(cluster_shape_fallback), make_tile(typename TiledMMA_SF::AtomThrID{})); + auto cluster_layout_sfb_vmnk_fallback = tiled_divide(make_layout(cluster_shape_fallback), make_tile(typename TiledMMA_SF::AtomThrID{})); typename Params::TMA_A tma_load_a = make_tma_atom_A_sm100( GmemTiledCopyA{}, @@ -649,7 +649,7 @@ struct CollectiveMma< return cute::make_tuple(tmem_storage.accumulators(_,_,_,stage)); } - template + template CUTLASS_DEVICE static auto init_tmem_tensors(EpilogueTile epi_tile) { @@ -660,7 +660,7 @@ struct CollectiveMma< tiled_mma, acc_shape, EpilogueTile{}); Tensor tCtSFA = make_tensor(shape(SmemLayoutAtomSFA{})); Tensor tCtSFB = make_tensor(shape(SmemLayoutAtomSFB{})); - + TmemStorage tmem_storage; tmem_storage.accumulators = accumulators; tmem_storage.tCtSFA = tCtSFA; @@ -669,10 +669,10 @@ struct CollectiveMma< return tmem_storage; } - template + template CUTLASS_DEVICE static void - set_tmem_offsets(TmemStorage& tmem_storage, uint32_t tmem_base_addr) { + set_tmem_offsets(TmemStorage& tmem_storage, uint32_t tmem_base_addr) { tmem_storage.accumulators.data() = tmem_base_addr; tmem_storage.tCtSFA.data() = tmem_storage.accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(tmem_storage.accumulators); tmem_storage.tCtSFB.data() = tmem_storage.tCtSFA.data().get() + cutlass::detail::find_tmem_tensor_col_offset(tmem_storage.tCtSFA); @@ -751,7 +751,6 @@ struct CollectiveMma< Tensor sSFB = make_tensor(make_smem_ptr(shared_tensors.smem_SFB.begin()), SmemLayoutSFB{}); // Define the CTA-in-cluster Layout and Coord - Layout cta_layout_mnk = make_layout(cluster_shape_); Layout cta_layout_vmnk = tiled_divide(cta_layout_mnk, make_tile(typename TiledMma::AtomThrID{})); auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster_); @@ -785,13 +784,11 @@ struct CollectiveMma< uint16_t mcast_mask_sfa = create_tma_multicast_mask<2>(cta_layout_vmnk, cta_coord_vmnk); uint16_t mcast_mask_sfb = create_tma_multicast_mask<1>(cta_layout_sfb_vmnk, cta_coord_sfb_vmnk); - LoadParams load_params { + return LoadParams{ size<3>(gA_mkl), // for scheduler tAgA_mkl, tBgB_nkl, tAsA, tBsB, // for input tensor values tAgSFA_mkl, tBgSFB_nkl, tAsSFA, tBsSFB, // for input scale factor tensor values - mcast_mask_a, mcast_mask_b, mcast_mask_sfa, mcast_mask_sfb // multicast masks - }; - return load_params; + mcast_mask_a, mcast_mask_b, mcast_mask_sfa, mcast_mask_sfb}; // multicast masks } /// Set up the data needed by this collective for mma compute. 
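// Minimal standalone illustration, not from this patch, of the refactor in the hunk above:
// load_init() (and mma_init() later in the patch) now returns a braced aggregate directly and
// lets class template argument deduction pick the template parameters instead of naming a
// LoadParams temporary. LoadParamsSketch and make_load_params below are hypothetical stand-ins.
#include <cstdint>

template <class TensorA, class TensorB>
struct LoadParamsSketch {
  int k_tiles;                                     // number of k tiles, for the scheduler
  TensorA gA; TensorB gB;                          // partitioned gmem tensors
  std::uint16_t mcast_mask_a, mcast_mask_b;        // TMA multicast masks
};

// C++17 requires an explicit deduction guide for aggregate CTAD
template <class TensorA, class TensorB>
LoadParamsSketch(int, TensorA, TensorB, std::uint16_t, std::uint16_t) -> LoadParamsSketch<TensorA, TensorB>;

template <class TensorA, class TensorB>
auto make_load_params(int k_tiles, TensorA gA, TensorB gB, std::uint16_t mask_a, std::uint16_t mask_b) {
  return LoadParamsSketch{k_tiles, gA, gB, mask_a, mask_b};  // single-expression return, as in the diff
}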
@@ -802,8 +799,8 @@ struct CollectiveMma< TensorStorage& shared_tensors) const { // Allocate "fragments/descriptors" for A and B matrices - Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) // Allocate "fragments/descriptors" for A and B matrices Tensor tCrA = TiledMma::make_fragment_A(sA); // (MMA,MMA_M,MMA_K,PIPE) @@ -854,17 +851,12 @@ struct CollectiveMma< tiled_mma.idesc_.a_format_ = uint8_t(runtime_data_type_a_) & 0b111; tiled_mma.idesc_.b_format_ = uint8_t(runtime_data_type_b_) & 0b111; } - MmaParams< - decltype(tCrA), decltype(tCrB), decltype(tCtSFA), decltype(tCtSFB), - decltype(tiled_copy_s2t_SFA), decltype(thr_tCsSFA_compact_s2t), decltype(thr_tCtSFA_compact_s2t), - decltype(tiled_copy_s2t_SFB), decltype(thr_tCsSFB_compact_s2t), decltype(thr_tCtSFB_compact_s2t) - > mma_params { + + return MmaParams{ tiled_mma, tCrA, tCrB, tCtSFA, tCtSFB, tiled_copy_s2t_SFA, thr_tCsSFA_compact_s2t, thr_tCtSFA_compact_s2t, - tiled_copy_s2t_SFB, thr_tCsSFB_compact_s2t, thr_tCtSFB_compact_s2t - }; - return mma_params; + tiled_copy_s2t_SFB, thr_tCsSFB_compact_s2t, thr_tCtSFB_compact_s2t}; } /// Perform a collective-scoped matrix multiply-accumulate @@ -983,52 +975,12 @@ struct CollectiveMma< uint32_t skip_wait = k_tile_count <= 0; auto barrier_token = mainloop_pipeline.consumer_try_wait(mainloop_pipe_consumer_state, skip_wait); + bool is_first_iter = true; // // PIPELINED MAIN LOOP // tiled_mma.accumulate_ = UMMA::ScaleOut::Zero; - if (k_tile_count > 0) { // first iteraion - // WAIT on mainloop_pipe_consumer_state until its data are available - // (phase bit flips from mainloop_pipe_consumer_state.phase() value) - mainloop_pipeline.consumer_wait(mainloop_pipe_consumer_state, barrier_token); - - // Compute on k_tile - int read_stage = mainloop_pipe_consumer_state.index(); - // Save current mainlop pipeline read state - auto curr_mainloop_pipe_consumer_state = mainloop_pipe_consumer_state; - - // Advance mainloop_pipe - ++mainloop_pipe_consumer_state; - --k_tile_count; - skip_wait = k_tile_count <= 0; - // Peek at next iteration - barrier_token = mainloop_pipeline.consumer_try_wait(mainloop_pipe_consumer_state, skip_wait); - - if (cute::elect_one_sync()) { - copy(tiled_copy_s2t_SFA, thr_tCsSFA_s2t(_,_,_,_,read_stage), thr_tCtSFA_s2t); - copy(tiled_copy_s2t_SFB, thr_tCsSFB_s2t(_,_,_,_,read_stage), thr_tCtSFB_s2t); - } - - if constexpr (IsOverlappingAccum) { - accumulator_pipeline.producer_acquire(accumulator_pipe_producer_state); - } - - // Unroll the K mode manually so we can set scale C to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M) x (V,N) => (V,M,N) - cute::gemm(tiled_mma.with(tiled_mma.accumulate_, - tCtSFA(_,_,k_block), - tCtSFB_mma(_,_,k_block)), - tCrA(_,_,k_block,read_stage), - tCrB(_,_,k_block,read_stage), - accumulators); - tiled_mma.accumulate_ = UMMA::ScaleOut::One; - } - mainloop_pipeline.consumer_release(curr_mainloop_pipe_consumer_state); - } - CUTLASS_PRAGMA_NO_UNROLL while (k_tile_count > 0) { // WAIT on mainloop_pipe_consumer_state until its data are available @@ -1052,6 +1004,13 @@ struct CollectiveMma< copy(tiled_copy_s2t_SFB, 
thr_tCsSFB_s2t(_,_,_,_,read_stage), thr_tCtSFB_s2t); } + if constexpr (IsOverlappingAccum) { + if (is_first_iter) { + accumulator_pipeline.producer_acquire(accumulator_pipe_producer_state); + is_first_iter = false; + } + } + // Unroll the K mode manually so we can set scale C to 1 CUTLASS_PRAGMA_UNROLL for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { @@ -1064,6 +1023,7 @@ struct CollectiveMma< accumulators); tiled_mma.accumulate_ = UMMA::ScaleOut::One; } + mainloop_pipeline.consumer_release(curr_mainloop_pipe_consumer_state); } diff --git a/include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp b/include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp index c7e56250..f1abb1eb 100644 --- a/include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp @@ -31,7 +31,6 @@ - #pragma once #include "cutlass/cutlass.h" @@ -239,12 +238,12 @@ struct CollectiveMma< cutlass::bits_to_bytes(size(AtomThrShapeMNK{}) * cosize(take<0,3>(SmemLayoutA{})) * cute::sizeof_bits_v) + cutlass::bits_to_bytes(size(AtomThrShapeMNK{}) * cosize(take<0,3>(SmemLayoutB{})) * cute::sizeof_bits_v); - template + template struct TmemStorage { AccTensor accumulators; }; - template< + template < class KTileCount, class GTensorPartitionedA, class GTensorPartitionedB, class STensorA, class STensorB @@ -273,7 +272,10 @@ struct CollectiveMma< , mcast_mask_a(mcast_mask_a_), mcast_mask_b(mcast_mask_b_) {} }; - template + template < + class TiledMma, + class FragmentA, class FragmentB + > struct MmaParams { TiledMma tiled_mma; FragmentA tCrA; @@ -336,7 +338,7 @@ struct CollectiveMma< , runtime_data_type_a_(params.runtime_data_type_a) , runtime_data_type_b_(params.runtime_data_type_b) { if constexpr (IsDynamicCluster) { - const bool is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x && + const bool is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x && cute::size<1>(cluster_shape_) == params.cluster_shape_fallback.y); observed_tma_load_a_ = is_fallback_cluster ? ¶ms.tma_load_a_fallback : ¶ms.tma_load_a; observed_tma_load_b_ = is_fallback_cluster ? ¶ms.tma_load_b_fallback : ¶ms.tma_load_b; @@ -461,7 +463,7 @@ struct CollectiveMma< return cute::make_tuple(tmem_storage.accumulators(_,_,_,stage)); } - template + template CUTLASS_DEVICE static auto init_tmem_tensors(EpilogueTile epi_tile) { @@ -475,10 +477,10 @@ struct CollectiveMma< return tmem_storage; } - template + template CUTLASS_DEVICE static void - set_tmem_offsets(TmemStorage& tmem_storage, uint32_t tmem_base_addr) { + set_tmem_offsets(TmemStorage& tmem_storage, uint32_t tmem_base_addr) { tmem_storage.accumulators.data() = tmem_base_addr; } @@ -535,21 +537,21 @@ struct CollectiveMma< // TMA Multicast Masks uint16_t mcast_mask_a = create_tma_multicast_mask<2>(cta_layout_vmnk, cta_coord_vmnk); uint16_t mcast_mask_b = create_tma_multicast_mask<1>(cta_layout_vmnk, cta_coord_vmnk); - - LoadParams load_params { + + return LoadParams{ shape<3>(gA_mkl), // for scheduler tAgA_mkl, tBgB_nkl, tAsA, tBsB, // for input tensor values - mcast_mask_a, mcast_mask_b // multicast masks - }; - return load_params; + mcast_mask_a, mcast_mask_b}; // multicast masks } /// Set up the data needed by this collective for mma compute. 
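// Standalone sketch, not from this patch, of the mainloop restructuring in the block-scaled
// collective above: the manually peeled "first iteration" block is folded back into the while
// loop, and the one-time accumulator acquire is guarded by an is_first_iter flag instead.
// acquire_accumulator/process_k_tile are hypothetical callables.
template <class AcquireFn, class ProcessFn>
void run_mainloop_sketch(int k_tile_count, AcquireFn&& acquire_accumulator, ProcessFn&& process_k_tile) {
  bool is_first_iter = true;
  while (k_tile_count > 0) {
    if (is_first_iter) {          // previously done in a peeled first iteration before the loop
      acquire_accumulator();
      is_first_iter = false;
    }
    process_k_tile(k_tile_count); // MMA work for this k tile
    --k_tile_count;
  }
}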
- template + template CUTLASS_DEVICE auto mma_init( - [[maybe_unused]] TmemStorage tmem_tensors, - TensorStorage& shared_tensors) const { + [[maybe_unused]] TmemStorage tmem_storage, + TensorStorage& shared_tensors) const { + + // Allocate "fragments/descriptors" for A and B matrices Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) @@ -558,7 +560,7 @@ struct CollectiveMma< Tensor tCrB = TiledMma::make_fragment_B(sB); // (MMA,MMA_N,MMA_K,PIPE) CUTE_STATIC_ASSERT_V(Int{} == size<3>(sA)); // PIPE - CUTE_STATIC_ASSERT_V(Int{} == size<3>(sB)); + CUTE_STATIC_ASSERT_V(Int{} == size<3>(sB)); // PIPE TiledMma tiled_mma; @@ -568,11 +570,10 @@ struct CollectiveMma< tiled_mma.idesc_.a_format_ = uint8_t(runtime_data_type_a_) & 0b111; tiled_mma.idesc_.b_format_ = uint8_t(runtime_data_type_b_) & 0b111; } - MmaParams mma_params { + + return MmaParams{ tiled_mma, - tCrA, tCrB - }; - return mma_params; + tCrA, tCrB}; } /// Perform a collective-scoped matrix multiply-accumulate @@ -657,6 +658,7 @@ struct CollectiveMma< ) { static_assert(is_tmem::value, "Accumulator must be tmem resident."); static_assert(rank(FrgLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA, MMA_M, MMA_N)"); + auto accumulators = get<0>(accumulators_pair); auto [tiled_mma, tCrA, tCrB] = mma_inputs; diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index e06ead97..546bf915 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -58,6 +58,7 @@ template < class ClusterShape, class KernelSchedule, int ScaleGranularityM_, + int ScaleGranularityN_, class TileShape_, class ElementA_, class StrideA_, @@ -73,7 +74,7 @@ template < class SmemCopyAtomB_, class TransformB_> struct CollectiveMma< - MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8, + MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8, TileShape_, ElementA_, StrideA_, @@ -92,7 +93,7 @@ struct CollectiveMma< // // Type Aliases // - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8; + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8; using TileShape = TileShape_; using ElementA = ElementA_; using StrideA = StrideA_; @@ -120,7 +121,9 @@ struct CollectiveMma< static constexpr int NumProducerThreadEvents = 2; static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; + static constexpr int ScaleGranularityN = ScaleGranularityN_ == 0 ? 
size<1>(TileShape{}) : ScaleGranularityN_; static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; + static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); @@ -131,6 +134,7 @@ struct CollectiveMma< static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); + static_assert((size<1>(TileShape{}) % ScaleGranularityN) == 0, "FP8 scaling granularity must evenly divide tile shape along N."); // Tile along modes in a way that maximizes the TMA box size. using SmemLayoutA = decltype(tile_to_shape( @@ -144,12 +148,13 @@ struct CollectiveMma< // Block scaling gmem-to-smem copy atom using BlockScaleCopyTypeA = cute::uint_byte_t(sizeof(ElementBlockScale)) * ScaleMsPerTile, 16)>; + using BlockScaleCopyTypeB = cute::uint_byte_t(sizeof(ElementBlockScale)) * ScaleNsPerTile, 16)>; using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; - using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; + using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; // Block scaling smem layout using SmemLayoutScaleA = Layout, Int>>; - using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. + using SmemLayoutScaleB = Layout, Int>>; static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); static_assert(cute::is_base_of::value && @@ -168,7 +173,7 @@ struct CollectiveMma< cute::array_aligned> smem_A; // mxk cute::array_aligned> smem_B; // nxk cute::array_aligned> smem_scale_A; // ScaleMsPerTile x k - cute::array_aligned> smem_scale_B; // 1xk + cute::array_aligned> smem_scale_B; // ScaleNsPerTile x k } tensors; using PipelineStorage = typename MainloopPipeline::SharedStorage; @@ -322,17 +327,17 @@ struct CollectiveMma< Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) // Make the tiled views of scale tensors - auto scaleA_shape = make_shape(get<2>(gA_mkl.shape()), Int{}, get<3>(gA_mkl.shape()), get<4>(gA_mkl.shape())); // (m,ScaleMsPerTile,k,l) - auto scale_dA = make_stride(get<3>(gA_mkl.shape()) * Int{}, Int<1>{}, Int{}, get<2>(gA_mkl.shape()) * get<3>(gA_mkl.shape()) * Int{}); + auto scaleA_shape = make_shape(shape<2>(gA_mkl), Int{}, shape<3>(gA_mkl), shape<4>(gA_mkl)); // (m,ScaleMsPerTile,k,l) + auto scaleB_shape = make_shape(shape<2>(gB_nkl), Int{}, shape<3>(gB_nkl), shape<4>(gB_nkl)); // (n,ScaleNsPerTile,k,l) + auto scale_dA = compact_order(scaleA_shape, Step<_2,_0,_1,_3>{}); + auto scale_dB = compact_order(scaleB_shape, Step<_2,_0,_1,_3>{}); auto scaleA_layout = make_layout(scaleA_shape, scale_dA); - auto scaleB_shape = make_shape(get<2>(gB_nkl.shape()), get<3>(gB_nkl.shape()), get<4>(gB_nkl.shape())); // (n,k,l) - auto scale_dB = make_stride(get<3>(gB_nkl.shape()), Int<1>{}, get<2>(gB_nkl.shape()) * get<3>(gB_nkl.shape())); auto scaleB_layout = make_layout(scaleB_shape, scale_dB); - // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and + // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as 
mScaleA_mkl and mScaleB_nkl. Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (m,ScaleMsPerTile,k,l) - Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) + Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,ScaleNsPerTile,k,l) return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); } @@ -356,13 +361,13 @@ struct CollectiveMma< uint32_t block_rank_in_cluster, TensorStorage& shared_tensors) { int lane_predicate = cute::elect_one_sync(); - // Blockscaling: Tma loads for load_input and CpAsync for load_scale if (lane_predicate) { + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (ScaleNsPerTile,k) // // Prepare the TMA loads for A and B @@ -388,10 +393,10 @@ struct CollectiveMma< Tensor mScaleB_nkl = get<3>(load_inputs); Tensor gScaleA = mScaleA_mkl(m_coord,_,_,l_coord); // (1,ScaleMsPerTile,k,1) - Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) + Tensor gScaleB = mScaleB_nkl(n_coord,_,_,l_coord); // (1,ScaleNsPerTile,k,1) TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, Layout>{}, Layout>>{}); // (1,ScaleMsPerTile,1) - TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, Layout>{}, Layout>{}); // (1,1,1) + TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, Layout>{}, Layout>>{}); // (1,ScaleNsPerTile,1) ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); @@ -446,7 +451,7 @@ struct CollectiveMma< // Copy scale tensors from global memory to shared memory copy(scale_copy_a, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); - copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); + copy(scale_copy_b, tBgB_ScaleB(_,_,*k_tile_iter), tBsB_ScaleB(_,_,write_stage)); pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); ++k_tile_iter; @@ -508,7 +513,11 @@ struct CollectiveMma< Shape, Int>, cute::tuple_element_t<1, TileShape>, Int>, Stride, _0, Int> >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + Tensor sScaleBViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), + Layout< + Shape, Shape, Int>, Int>, + Stride<_0, Stride<_0, _1>, Int> + >{}); // (m,(ScaleGranularityN,ScaleNsPerTile),k) // // Define C accumulators and A/B partitioning @@ -531,7 +540,8 @@ struct CollectiveMma< TiledMma tiled_mma; auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); - Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. 
+ Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. + Tensor tCsScaleBViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleBViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) @@ -557,11 +567,8 @@ struct CollectiveMma< PipelineState smem_pipe_release = smem_pipe_read; // Per block scale values for operand A and B - using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. - using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above - - Tensor tCrScaleAViewAsC = make_tensor(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) - ElementBlockScale scale_b; + Tensor tCrScaleAViewAsC = make_tensor_like(tCsScaleAViewAsC(_, _, _, 0)); // (MMA,MMA_M,MMA_N) + Tensor tCrScaleBViewAsC = make_tensor_like(tCsScaleBViewAsC(_, _, _, 0)); // (MMA,MMA_M,MMA_N) // Prologue GMMAs int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); @@ -583,21 +590,26 @@ struct CollectiveMma< int read_stage = smem_pipe_read.index(); - // Load per block scale values from shared memory to registers. - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + // Load per block scale values from shared memory to registers + copy(tCsScaleAViewAsC(_, _, _, read_stage), tCrScaleAViewAsC); + copy(tCsScaleBViewAsC(_, _, _, read_stage), tCrScaleBViewAsC); + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + tCrScaleAViewAsC.data()[0] = tCrScaleAViewAsC.data()[0] * tCrScaleBViewAsC.data()[0]; } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
- } else { + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_b = tCrScaleBViewAsC.data()[0]; CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + for (int i = 0; i < size(tCrScaleAViewAsC); i++) { tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; } } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + ElementBlockScale scale_a = tCrScaleAViewAsC.data()[0]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrScaleBViewAsC); i++) { + tCrScaleBViewAsC.data()[i] = tCrScaleBViewAsC.data()[i] * scale_a; + } + } warpgroup_arrive(); // Unroll the K mode manually to set scale D to 1 @@ -609,8 +621,20 @@ struct CollectiveMma< } warpgroup_commit_batch(); - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` and `tCrScaleBViewAsC` + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_ab = tCrScaleAViewAsC.data()[0]; + accumulation.scale_if_needed(scale_ab); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + accumulation.scale_if_needed(tCrScaleAViewAsC); + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + accumulation.scale_if_needed(tCrScaleBViewAsC); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { + accumulation.scale_if_needed(tCrScaleAViewAsC, tCrScaleBViewAsC); + } ++smem_pipe_read; } @@ -632,21 +656,26 @@ struct CollectiveMma< int read_stage = smem_pipe_read.index(); - // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + // Load per block scale values from shared memory to registers (at most twice per block along M and/or N) + copy(tCsScaleAViewAsC(_, _, _, read_stage), tCrScaleAViewAsC); + copy(tCsScaleBViewAsC(_, _, _, read_stage), tCrScaleBViewAsC); + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + tCrScaleAViewAsC.data()[0] = tCrScaleAViewAsC.data()[0] * tCrScaleBViewAsC.data()[0]; } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
- } else { + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_b = tCrScaleBViewAsC.data()[0]; CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + for (int i = 0; i < size(tCrScaleAViewAsC); i++) { tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; } } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + ElementBlockScale scale_a = tCrScaleAViewAsC.data()[0]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrScaleBViewAsC); i++) { + tCrScaleBViewAsC.data()[i] = tCrScaleBViewAsC.data()[i] * scale_a; + } + } if (accumulation.prepare_if_needed()) { tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; @@ -667,8 +696,20 @@ struct CollectiveMma< warpgroup_wait(); warpgroup_fence_operand(accumulation()); - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` and `tCrScaleBViewAsC` + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_ab = tCrScaleAViewAsC.data()[0]; + accumulation.scale_if_needed(scale_ab); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + accumulation.scale_if_needed(tCrScaleAViewAsC); + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + accumulation.scale_if_needed(tCrScaleBViewAsC); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { + accumulation.scale_if_needed(tCrScaleAViewAsC, tCrScaleBViewAsC); + } pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it @@ -677,7 +718,19 @@ struct CollectiveMma< ++smem_pipe_release; } - accumulation.scale_residue_if_needed(tCrScaleAViewAsC); + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_ab = tCrScaleAViewAsC.data()[0]; + accumulation.scale_residue_if_needed(scale_ab); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + accumulation.scale_residue_if_needed(tCrScaleAViewAsC); + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + accumulation.scale_residue_if_needed(tCrScaleBViewAsC); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { + accumulation.scale_residue_if_needed(tCrScaleAViewAsC, tCrScaleBViewAsC); + } warpgroup_fence_operand(accumulation()); } diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp index 155d023d..8747f48b 100644 --- a/include/cutlass/gemm/dispatch_policy.hpp +++ b/include/cutlass/gemm/dispatch_policy.hpp @@ -117,7 +117,11 @@ struct KernelPtrArrayTmaWarpSpecializedPingpong { }; // FP8 related policies (including Blocked Scaled Accumulation) template< - int ScaleGranularityM = 0 // `ScaleGranularityM` specifies scaling granularity along M, while zero-value `ScaleGranularityM` indicates that scaling granularity is `size<0>(TileShape_MNK{})` along M. + // `ScaleGranularityM`/`ScaleGranularityN` specifies scaling granularity along M/N, while zero-value + // `ScaleGranularityM`/`ScaleGranularityN` indicates that scaling granularity is + // `size<0>(TileShape_MNK{})`/`size<1>(TileShape_MNK{})` along M/N. 
+ int ScaleGranularityM = 0, + int ScaleGranularityN = 0 > struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum: KernelTmaWarpSpecializedCooperative { }; @@ -302,12 +306,16 @@ template< int Stages_, class ClusterShape_ = Shape<_1,_1,_1>, class KernelSchedule = KernelTmaWarpSpecialized, - int ScaleGranularityM = 0 // `ScaleGranularityM` specifies scaling granularity along M, while zero-value `ScaleGranularityM` indicates that scaling granularity is `size<0>(TileShape_MNK{})` along M. + // `ScaleGranularityM`/`ScaleGranularityN` specifies scaling granularity along M/N, while zero-value + // `ScaleGranularityM`/`ScaleGranularityN` indicates that scaling granularity is + // `size<0>(TileShape_MNK{})`/`size<1>(TileShape_MNK{})` along M/N. + int ScaleGranularityM = 0, + int ScaleGranularityN = 0 > struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8 : MainloopSm90TmaGmmaWarpSpecialized { static_assert( - cute::is_same_v>, + cute::is_same_v>, "KernelSchedule must be one of the warp specialized policies"); }; diff --git a/include/cutlass/gemm/kernel/rank_2k_grouped.h b/include/cutlass/gemm/kernel/rank_2k_grouped.h index 84d70212..41165cfd 100644 --- a/include/cutlass/gemm/kernel/rank_2k_grouped.h +++ b/include/cutlass/gemm/kernel/rank_2k_grouped.h @@ -397,8 +397,6 @@ public: // An example of an unneeded threadblock is one that is assigned to compute in the upper // portion of a Rank2K kernel filled with mode kLower. // - // TODO: Consider pushing these checks into ProblemVisitor to avoid spuriously - // returning from `next_tile()`. // // Early exit if threadblock is out of range diff --git a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp index 69748c9c..65885b8a 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp @@ -1131,6 +1131,10 @@ public: } } + else { + // Register reconfiguration + arch::warpgroup_reg_dealloc(); + } } }; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp index 5d03f921..95cc663b 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp @@ -29,8 +29,6 @@ * **************************************************************************************************/ - - #pragma once #include "cutlass/cutlass.h" @@ -564,20 +562,21 @@ public: // Sync deallocation status between MMA warps of peer CTAs arch::ClusterBarrier& tmem_deallocation_result_barrier = shared_storage.pipelines.tmem_dealloc; [[maybe_unused]] uint32_t dealloc_barrier_phase = 0; - if constexpr(!IsOverlappingAccum) { - if (WarpCategory::MMA == warp_category && has_mma_peer_cta && lane_predicate) { - tmem_deallocation_result_barrier.init(NumMMAThreads); + if (WarpCategory::MMA == warp_category) { + if constexpr(!IsOverlappingAccum) { + if (has_mma_peer_cta && lane_predicate) { + tmem_deallocation_result_barrier.init(NumMMAThreads); + } + } + else { + if (has_mma_peer_cta && lane_predicate) { + tmem_deallocation_result_barrier.init(NumEpilogueThreads*2); + } + else if (lane_predicate) { + tmem_deallocation_result_barrier.init(NumEpilogueThreads); + } } } - else { - if (WarpCategory::MMA == warp_category && has_mma_peer_cta && lane_predicate) { - 
tmem_deallocation_result_barrier.init(NumEpilogueThreads*2); - } - else if (WarpCategory::MMA == warp_category && lane_predicate) { - tmem_deallocation_result_barrier.init(NumEpilogueThreads); - } - } - // Initialize smem barrier for prologue throttling. Epilogue warps are stalled until the prologue finishes. arch::ClusterBarrier& epilogue_throttle_barrier = shared_storage.pipelines.epilogue_throttle; @@ -699,7 +698,6 @@ public: epilogue_throttle_barrier.arrive(); if constexpr (IsSchedDynamicPersistent) { - // Whether a new CLC query must be performed. // See comment below where this variable is updated for a description of // why this variable is needed. @@ -738,7 +736,6 @@ public: work_tile_info = next_work_tile_info; } while (work_tile_info.is_valid()); clc_pipeline.producer_tail(clc_pipe_producer_state); - } } @@ -963,7 +960,6 @@ public: epi_load_pipe_consumer_state = load_state_next; epi_store_pipe_producer_state = store_state_next; accumulator_pipe_consumer_state = acc_state_next; - do_tail_store = true; } work_tile_info = next_work_tile_info; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp index 4e1d2930..afadb309 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp @@ -1057,6 +1057,10 @@ public: } } + else { + // Register reconfiguration + arch::warpgroup_reg_dealloc(); + } } }; diff --git a/include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp b/include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp index f7be566f..8e503353 100644 --- a/include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp +++ b/include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp @@ -783,7 +783,6 @@ private: int L_idx, Split_idx; params_.sk_params_.divmod_splits_(L_idx, Split_idx, work_tile_info.L_idx); - // TODO: Modularize the SM90 scheduler to pull out and reuse this redundant code int additional_k_tiles = 0; int split_start_offset = params_.sk_params_.big_units_; diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp index 4482e25d..cfb6912c 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp @@ -455,8 +455,9 @@ public: auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) TileScheduler scheduler{params.scheduler}; - auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{}); - + // Declare work_tile_info, then define it in each of warps that use it. 
+ typename TileScheduler::WorkTileInfo work_tile_info; + // In a warp specialized kernel, collectives expose data movement and compute operations separately CollectiveMainloop collective_mainloop; @@ -474,6 +475,7 @@ public: cluster_wait_fn(); if (warp_group_role == WarpGroupRole::Producer) { + work_tile_info = scheduler.initial_work_tile_info(ClusterShape{}); cutlass::arch::warpgroup_reg_dealloc(); // Mainloop Producer Warp @@ -578,6 +580,7 @@ public: } // Producer Warp Group End else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + work_tile_info = scheduler.initial_work_tile_info(ClusterShape{}); cutlass::arch::warpgroup_reg_alloc(); CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); diff --git a/include/cutlass/gemm/kernel/tile_scheduler_params.h b/include/cutlass/gemm/kernel/tile_scheduler_params.h index aa599a35..e1579d3f 100644 --- a/include/cutlass/gemm/kernel/tile_scheduler_params.h +++ b/include/cutlass/gemm/kernel/tile_scheduler_params.h @@ -265,7 +265,7 @@ struct PersistentTileSchedulerSm90Params { } // In case the maximum number of clusters that could co-exist on the target device is // already calculated using cudaOccupancyMaxActiveClusters - else if (max_active_clusters != 0) { + else if (max_active_clusters != 0 && max_active_clusters * cluster_size <= sm_count) { if (raster_order == RasterOrder::AlongN) { launch_grid.y = max_active_clusters * cluster_shape.n(); } @@ -1204,6 +1204,7 @@ struct PersistentTileSchedulerSm90StreamKParams { KernelHardwareInfo new_hw_info; new_hw_info.device_id = hw_info.device_id; new_hw_info.sm_count = hw_info.sm_count; + new_hw_info.max_active_clusters = hw_info.max_active_clusters; if (new_hw_info.sm_count <= 0) { CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); @@ -1787,7 +1788,7 @@ struct PersistentTileSchedulerSm90GroupParams { } // In case the maximum number of clusters that could co-exist on the target device is // already calculated using cudaOccupancyMaxActiveClusters - else if (max_active_clusters != 0) { + else if (max_active_clusters != 0 && max_active_clusters * cluster_size <= sm_count) { if (raster_order == RasterOrder::AlongN) { launch_grid.y = max_active_clusters * cluster_shape.n(); } @@ -2499,6 +2500,7 @@ struct PersistentTileSchedulerSm100GroupParams { bool is_static_cluster_shape = false) { int const sm_count = hw_info.sm_count; + int const max_active_clusters = hw_info.max_active_clusters; // Round up to nearest multiple of swizzle_size along each mode auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size); @@ -2542,6 +2544,18 @@ struct PersistentTileSchedulerSm100GroupParams { launch_grid.x = possibly_truncate(sm_count, problem_blocks_total); } } + // In case the maximum number of clusters that could co-exist on the target device is + // already calculated using cudaOccupancyMaxActiveClusters + else if (max_active_clusters != 0 && max_active_clusters * cluster_size <= sm_count) { + if (raster_order == RasterOrder::AlongN) { + launch_grid.y = max_active_clusters * cluster_shape.n(); + } + else { + launch_grid.x = max_active_clusters * cluster_shape.m(); + } + CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using cudaOccupancyMaxActiveClusters = " + "(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n"); + } else { 
constexpr int max_sm_per_gpc = 20; int cta_per_device = get_max_cta_occupancy(max_sm_per_gpc, cluster_shape, sm_count); diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h index f6cc735a..0d1da845 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h @@ -142,7 +142,6 @@ class MmaVoltaTensorOpMultiplicandTileIterator< "Shape of warp-level Mma must be divisible by operator shape."); // Shape of one individual LDS.128 - // TODO: 32 and 4 are hardcoded, 32-by-4 is logical shape using LdsShape = layout::PitchLinearShape< 32, 4 @@ -458,7 +457,6 @@ class MmaVoltaTensorOpMultiplicandTileIterator< "Shape of warp-level Mma must be divisible by operator shape."); // Shape of one individual LDS - // TODO: remove hardcoded 32 and 4 using LdsShape = layout::PitchLinearShape< 32, 4 diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h index d53d6dfd..a5370ff8 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h @@ -995,7 +995,6 @@ public: CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) { - // TODO: fix this if it becomes an issue during warp it reset add_tile_offset(tile_offset); return *this; diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h index 91e4a9ef..faf64275 100644 --- a/include/cutlass/layout/tensor.h +++ b/include/cutlass/layout/tensor.h @@ -41,7 +41,6 @@ #pragma once #include - #include "cutlass/cutlass.h" #include "cutlass/fast_math.h" #include "cutlass/layout/pitch_linear.h" diff --git a/include/cutlass/numeric_types.h b/include/cutlass/numeric_types.h index b0c616a7..b44264bb 100644 --- a/include/cutlass/numeric_types.h +++ b/include/cutlass/numeric_types.h @@ -82,7 +82,7 @@ struct get_unpacked_element_type { #include "cutlass/tfloat32.h" #include "cutlass/float8.h" #include "cutlass/uint128.h" -#include "cutlass/exmy_base.h" -#include "cutlass/float_subbyte.h" +#include "cutlass/exmy_base.h" +#include "cutlass/float_subbyte.h" ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/media/docs/blackwell_cluster_launch_control.md b/media/docs/blackwell_cluster_launch_control.md index fe13b960..faebb900 100644 --- a/media/docs/blackwell_cluster_launch_control.md +++ b/media/docs/blackwell_cluster_launch_control.md @@ -2,9 +2,9 @@ ## Overview -A GEMM workload usually consists of three phases: prologue, mainloop and epilogue. Each available SM will process multiple output tiles in series if the number of output tiles are much more than the number of available SMs, completely exposing the overhead of prologue and epilogue. +A GEMM workload usually consists of three phases: prologue, mainloop and epilogue. Each SM will process multiple output tiles in series if the number of output tiles are much more than the number of SMs, completely exposing the overhead of prologue and epilogue. -Consider a GEMM that has `20x20x1` output tiles, running on a GPU with `100` SMs. Only `80` out of the `100` SMs are available. Assume cluster shape is `1x1x1`. The following diagram shows how the schedule would look like for such a kernel. +Consider a GEMM that has `20x20x1` output tiles, running on a GPU with `100` SMs. 
Another kernel is occupying all the resources of `20` SMs, so only `80` SMs can be used. Assume the cluster shape is `1x1x1`. The following diagram shows what the schedule would look like for such a kernel.

*(Figure: how the `20x20x1` output tiles are scheduled across the `80` usable SMs.)*
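
To make the arithmetic behind this example concrete, below is a minimal, hedged sketch in plain C++ (an editorial illustration, not CUTLASS code): it computes how many output tiles each of the `80` usable SMs processes for the `20x20x1` tile grid, which is what repeatedly exposes the prologue and epilogue in a non-persistent launch.

```c++
#include <iostream>

int main() {
  // Example from the text: 20x20x1 output tiles on a 100-SM GPU where another
  // kernel occupies 20 SMs, so only 80 SMs (one worker per SM with a 1x1x1
  // cluster) are usable for this GEMM.
  int tiles_m = 20, tiles_n = 20, tiles_l = 1;
  int usable_sms = 80;

  int total_tiles  = tiles_m * tiles_n * tiles_l;                  // 400 output tiles
  int tiles_per_sm = (total_tiles + usable_sms - 1) / usable_sms;  // ceil(400/80) = 5

  // A non-persistent launch pays the prologue and epilogue once per tile,
  // i.e. 5 times per SM here; a persistent worker pays them only once.
  std::cout << "total tiles: " << total_tiles
            << ", tiles per usable SM: " << tiles_per_sm << "\n";
  return 0;
}
```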

@@ -12,22 +12,22 @@ Consider a GEMM that has `20x20x1` output tiles, running on a GPU with `100` SMs ### Static Scheduler CUTLASS has adopted a software technique named **persistent kernels**. Persistent clusters, or Workers, can stay on the GPU throughout kernel execution and process multiple tiles, hiding prologue and epilogue costs. The tile scheduler statically determines the next output tile to process with zero overhead. -However, static scheduler is susceptible to workload imbalance if some SMs are unavailable. The following diagram illustrates this issue. +However, static scheduler is susceptible to workload imbalance if the resources of some SMs are unavailable. The following diagram illustrates this issue.

*(Figure: workload imbalance under the static scheduler when the resources of some SMs are unavailable.)*
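
As a rough, hedged model of why a statically scheduled persistent grid becomes imbalanced (an editorial sketch in plain C++, not the actual CUTLASS tile scheduler), the snippet below assigns tiles to workers with a fixed `tile_id % num_workers` rule decided at launch time: when the grid is sized for `100` SMs but only `80` are free, the tiles owned by the last `20` workers can only run after resources free up, whereas a dynamic scheduler could let the `80` resident workers drain all tiles.

```c++
#include <iostream>
#include <vector>

int main() {
  // Editorial model only: real persistent schedulers are more sophisticated.
  const int total_tiles = 20 * 20; // 20x20x1 output tiles
  const int launched    = 100;     // persistent workers sized for 100 SMs
  const int resident    = 80;      // SMs actually free for this kernel

  // Static scheduling: worker w owns tiles w, w + launched, w + 2*launched, ...
  std::vector<int> tiles_per_worker(launched, 0);
  for (int tile = 0; tile < total_tiles; ++tile) {
    tiles_per_worker[tile % launched] += 1;
  }

  // Workers that cannot become resident strand their statically assigned tiles
  // until SM resources free up, which shows up as the imbalanced tail above.
  int stranded_tiles = 0;
  for (int w = resident; w < launched; ++w) {
    stranded_tiles += tiles_per_worker[w];
  }

  // A dynamic scheduler would let the 80 resident workers process all tiles:
  // ceil(400 / 80) = 5 tiles per resident worker, with no stranded tail.
  int dynamic_tiles_per_worker = (total_tiles + resident - 1) / resident;

  std::cout << "statically stranded tiles: " << stranded_tiles << "\n";
  std::cout << "dynamic tiles per resident worker: " << dynamic_tiles_per_worker << "\n";
  return 0;
}
```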

### Dynamic Scheduler with Cluster Launch Control -A fundamental limitation of persistent scheduling is that the kernel is unaware of the number of available SMs in real time. Some SMs might be occupied by another kernel and thus be unavailable. This makes it challenging to load-balance work across available SMs. +A fundamental limitation of persistent scheduling is that the number of SMs this kernel can utilize is unknown in real time. Some SMs might be occupied by another kernel and thus their resources are unavailable. This makes it challenging to load-balance work across SMs. Blackwell introduces cluster launch control (CLC) for dynamic scheduling. (See https://docs.nvidia.com/cuda/parallel-thread-execution). With this feature, the kernel launches a grid containing as many threadblocks as there are output tiles to compute in the kernel -- just like one would in a non-persistent kernel. Here we define `ClcID` to be a coordinate from the 3D grid launched on GPU. Cluster launch control follows the below rules: -1. A `ClcID` will be launched as a Worker when there are available SMs. +1. A `ClcID` will be launched as a Worker when there are available resources. 2. A `ClcID` can be queried by an existing Worker via `clusterlaunchcontrol.try_cancel` instruction. 3. Every `ClcID` is guaranteed to be processed by either (1) or (2). -4. Each Worker is pre-loaded with a `ClcID`, which is the coordinate indicated by `{blockIdx.x, blockIdx.y, blockIdx.z}`. -5. `clusterlaunchcontrol.try_cancel` instruction returns either a success signal with a `ClcID` or a decline signal. The most common reason of a decline is that akk `ClcID`s have been processed. +4. Each worker uses the `{blockIdx.x, blockIdx.y, blockIdx.z}` coordinate as the first output tile to process and uses the CLC query for subsequent processing of output tiles. +5. `clusterlaunchcontrol.try_cancel` instruction returns either a success signal with a `ClcID` or a decline signal. The most common reason of a decline is that all `ClcID`s have been processed. 6. Cluster launch control works on the granularity of clusters. For example, a 2x2 persistent worker cluster's query will consume 2x2 `ClcID`s at once. The following diagram shows how the schedule would look like with cluster launch control. diff --git a/media/docs/blackwell_functionality.md b/media/docs/blackwell_functionality.md index a7c6169f..02488a3b 100644 --- a/media/docs/blackwell_functionality.md +++ b/media/docs/blackwell_functionality.md @@ -285,7 +285,9 @@ Layout, and Dispatch Policy combinations for each row of [Table 1](#legacy_gemm_ | 1/2 SM | Epilogue Dispatch Policy | |--------|------------------------------------------| | 1SM | cutlass::epilogue::TmaWarpSpecialized1Sm | +| 1SM | cutlass::epilogue::NoSmemWarpSpecialized1Sm | | 2SM | cutlass::epilogue::TmaWarpSpecialized2Sm | +| 2SM | cutlass::epilogue::NoSmemWarpSpecialized2Sm | **Table 15: Epilogue PerSmTileShape_MNK** | 1/2 SM | MMA tile Shape | PerSmTileShape_MNK | @@ -442,7 +444,7 @@ PerSmTileShape_MNK should be deduced from the mainloop setup. For example, in ab It means each CTA is doing (256 / 2sm) x 256 x 128 output, so the PerSmTileShape_MNK is 128x256x128. 
The possible PerSmTileShape_MNK is listed in [Table 15](#epi_persmtileshape) -The epilogue scheduling policy is configurable, and it is common to set `cutlass::epilogue::TmaWarpSpecialized2Sm` +The epilogue scheduling policy is configurable, and it is common to set `cutlass::epilogue::collective::EpilogueScheduleAuto` to allow the epilogue builder to automatically select the appropriate policy. However, it can also be explicitly defined to use other policies based on the 1sm or 2sm MMA instruction. The available policies are listed in [Table 14](#epi_dispatch). @@ -458,10 +460,6 @@ use other policies based on the 1sm or 2sm MMA instruction. The available polici using ElementAccumulator = float; // Epilogue computation's precision type using ElementCompute = float; - // Cluster size for multicast - using ClusterShape_MNK = Shape<_4,_4,_1>; - // Collective Epilogue takes the output tile shape for 1 CTA - using PerSmTileShape_MNK = Shape<_128,_256,_128>; // // Construct CollectiveEpilogue @@ -469,7 +467,7 @@ use other policies based on the 1sm or 2sm MMA instruction. The available polici using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec - PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape + MmaTileShape_MNK, ClusterShape_MNK, // MMA tile shape, and cluster shape cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue ElementC, GmemLayoutC, AlignC, // C tensor description @@ -499,12 +497,12 @@ Typically, GmemLayoutSFD would be same as the GmemLayoutD. using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec - PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape + MmaTileShape_MNK, ClusterShape_MNK, // MMA tile shape, and cluster shape cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue ElementC, GmemLayoutC, AlignC, // C tensor description ElementD, GmemLayoutD, AlignD, // D tensor description - cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy + cutlass::epilogue::TmaWarpSpecialized2Sm // Epilogue schedule policy FusionOperation // <================================== Pass the fusion config into epilogue builder. >::CollectiveOp; ``` diff --git a/media/docs/fundamental_types.md b/media/docs/fundamental_types.md index e50cd384..3bfc4453 100644 --- a/media/docs/fundamental_types.md +++ b/media/docs/fundamental_types.md @@ -32,8 +32,8 @@ CUTLASS defines classes for the following numeric data types. * `type_erased_dynamic_float4_t`: Type agnostic 4 bits signed float allowing the user to provide a specific datatype as runtime argument. * `mx_float8_t` or `mx_float8_t` : Block scaled data type with fp8 element type and float_ue8m0_t scale factor and vector size of 32. * `mx_float6_t` or `mx_float6_t` : Block scaled data type with fp6 element type and float_ue8m0_t scale factor and vector size of 32. -* `mx_float6_t` : Block scaled data type with signed e2m1 element type and float_ue8m0_t scale factor and vector size of 32. 
-* `nv_float4_t` : Block scaled data type with signed e2m1 element type and float_ue8m0_t scale factor and vector size of 16. +* `mx_float4_t` : Block scaled data type with signed e2m1 element type and float_ue8m0_t scale factor and vector size of 32. +* `nv_float4_t` : Block scaled data type with signed e2m1 element type and float_ue4m3_t scale factor and vector size of 16. * `complex`: defines complex-valued data type based on the supplied real-valued numeric type Numeric types in CUTLASS may be used in both host and device code and are intended to function diff --git a/media/docs/profiler.md b/media/docs/profiler.md index 736344b4..057fd2d8 100644 --- a/media/docs/profiler.md +++ b/media/docs/profiler.md @@ -308,6 +308,9 @@ GEMM [int] --cluster_m,--cluster-shape::m Cluster shape in the M dimension [int] --cluster_n,--cluster-shape::n Cluster shape in the N dimension [int] --cluster_k,--cluster-shape::k Cluster shape in the K dimension + [int] --cluster_m_fallback,--cluster-shape-fallback::m Fallback cluster shape in the M dimension + [int] --cluster_n_fallback,--cluster-shape-fallback::n Fallback cluster shape in the N dimension + [int] --cluster_k_fallback,--cluster-shape-fallback::k Fallback cluster shape in the K dimension [int] --stages,--threadblock-stages Number of stages of threadblock-scoped matrix multiply [int] --warps_m,--warp-count::m Number of warps within threadblock along the M dimension [int] --warps_n,--warp-count::n Number of warps within threadblock along the N dimension @@ -320,6 +323,7 @@ GEMM [enum] --raster_order={heuristic|H|along_m|M|along_n|N} If supported by kernel, sets the tile raster direction [int] --swizzle_size={1,2,4,8} If supported by kernel, sets the 2D tile swizzle extent (In Hopper, other values will be rounded down to the nearest supported value) [int] --use_pdl,--use-pdl Use PDL (true, false) + [int] --enable_sm90_mixed_dtype_shuffle_test If true, the profiler will test SM90 mixed input kernels that can use shuffled input layouts for better performance [enum] --runtime_input_datatype_a Runtime data type for A matrix, narrow-precision only (e4m3, e5m2, e3m2, e2m3, e2m1) [enum] --runtime_input_datatype_b Runtime data type for B matrix, narrow-precision only (e4m3, e5m2, e3m2, e2m3, e2m1) @@ -360,11 +364,12 @@ Profile when execution is performed on device 0 and the C tensor is located on a $ cutlass_profiler --device=0 --allocations=C:1,D:2 --operation=Gemm --m=1024 --n=1024 --k=128 ``` -The format of tensor argument is followed by `:`. The type could be `f32` as 32-bit floating point, `s8` as 8-bit signed integer, etc. The available types can be referred to the `NumericTypeID_enumerants` in [util.cu](tools/library/src/util.cu). The layout could be `row` or `column`. +The format of tensor argument is followed by `:`. The type could be `f32` as 32-bit floating point, `s8` as 8-bit signed integer, etc. The available types can be referred to the `NumericTypeID_enumerants` in [util.cu](tools/library/src/util.cu). The layout could be `row` or `column`. If `--enable_sm90_mixed_dtype_shuffle_test=true` is used, the actual layout of the narrow data type matrix is a shuffled layout, neither `row` nor `column`. In addition to encoded data types, CUTLASS profiler allows non-encoded generic data types, namely `f8`, `f6`, and `f4`, with corresponding encoding specified through GEMM input argument: `--runtime_input_datatype_a` and `--runtime_input_datatype_b`. Currently, six encoding schemes are supported: `e4m3`, `e5m2`, `e3m2`, `e2m3`, and `e2m1`. 
-Cluster shapes can be statically set to `Shape;` and specified via runtime arguments: `cluster_m`, `cluster_n` and `cluster_k` in CUTLASS profiler. One may refer to our CUTLASS Example [73_blackwell_gemm_flexible_cluster](../../examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for more details of the this feature. +Cluster shapes can be statically set to `Shape;` and specified via runtime arguments: `cluster_m`, `cluster_n` and `cluster_k` in CUTLASS profiler. In addition to preferred cluster shapes, a user can also specify fallback cluster shapes via runtime arguments: `cluster_m_fallback`, `cluster_n_fallback` and `cluster_k_fallback` in CUTLASS profiler. Those fallback cluster shapes are smaller shapes than the preferred ones for the hardware to assign when there is no chance to issue a larger preferred CGA cluster to the GPU. There are several rules for using a flexible CGA: 1) Preferred CGA size should be divisible by fallback CGA size. 2) Grid dim should be divisible by preferred CGA size. 3) Preferred CGA and fallback CGA must have the same depth (cluster_dim.z must be equal). One may refer to our CUTLASS Example [73_blackwell_gemm_flexible_cluster](../../examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for more details of the this feature. +Please be noted that this feature (flexible cluster shapes within a single grid) is only applicable to `sm100a` kernels. The hardware will rasterize into a single cluster shape for those kernels that do not support this feature even with preferred or fallback cluster shapes assigned. CUTLASS 3.x kernels for Hopper and Blackwell also support a new feature called programatic dependent launch (PDL). This can be enabled with `--use-pdl`, and can overlap the epilogue of the prior kernel with the prologue of the next kernel. This can effectively hide kernel prologues. Using PDL can improve performance for back to back GEMMs. See [dependent kernel launch](dependent_kernel_launch.md) for more information. CUDA graphs can also be used (`--use-cuda-graphs`) with PDL to ensure that smaller kernels are enqueued back-to-back on a stream. @@ -585,6 +590,9 @@ Conv2d [int] --cluster_m,--cluster-shape::m Cluster shape in the M dimension [int] --cluster_n,--cluster-shape::n Cluster shape in the N dimension [int] --cluster_k,--cluster-shape::k Cluster shape in the K dimension + [int] --cluster_m_fallback,--cluster-shape-fallback::m Fallback cluster shape in the M dimension + [int] --cluster_n_fallback,--cluster-shape-fallback::n Fallback cluster shape in the N dimension + [int] --cluster_k_fallback,--cluster-shape-fallback::k Fallback cluster shape in the K dimension [int] --stages,--threadblock-stages Number of stages of threadblock-scoped matrix multiply [int] --warps_m,--warp-count::m Number of warps within threadblock along the M dimension [int] --warps_n,--warp-count::n Number of warps within threadblock along the N dimension diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index dd1b0c6f..cfcd5df1 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -672,11 +672,8 @@ The kernel starts with setting up datatypes and cluster shapes. 
using ElementAccumulator = float; using ElementCompute = float; using ElementBias = cutlass::half_t; - using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>; - using ClusterShape = Shape<_1,_1,_1>; - using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{})); - using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{})); - using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{})); + using MmaTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>; + using ClusterShape = cute::Shape<_1,_1,_1>; ``` The epilogue needs to be instantiated first as the mainloop collective builder takes the shared memory budget of epilogue in the template parameter list. The 3.x epilogue collective builder API has not changed @@ -688,13 +685,12 @@ for Blackwell, so the epilogue fusion is built in a same way as an SM90 epilogue using FusionOperation = cutlass::epilogue::fusion::LinearCombination< ElementD, ElementCompute, - ElementC, - ElementBias + ElementC >; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, - OutputCtaShape, ClusterShape, + MmaTileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, ElementCompute, ElementC, LayoutC, 16 / sizeof(ElementC), @@ -728,8 +724,6 @@ dispatch policies can be in [blackwell_functionality.md](./blackwell_functionali >; ``` -It is worth noting that the mainloop builder takes `MmaTileShape` while the epilogue builder takes `OutputCtaShape`. - Instantiating a blockscaled GEMM kernel is slightly different. Referring to an [MXFP8 GEMM](./../../test/unit/gemm/device/sm100_gemm_mxf8_mxf8_mxf8_tensor_op_f32_auto.cu) sample unit test, it takes a different tensor operation class: ```c++ @@ -742,10 +736,10 @@ are needed in the mainloop builder: ```c++ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, - ElementA, GmemLayoutA, 16, - ElementB, GmemLayoutB, 16, + ElementA, LayoutA, 16, + ElementB, LayoutB, 16, ElementAccumulator, - MmaTileShape_MNK, ClusterShape_MNK, + MmaTileShape, ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, cutlass::gemm::KernelScheduleAuto >::CollectiveOp; diff --git a/python/cutlass/backend/c_types.py b/python/cutlass/backend/c_types.py index 83a80e81..1e8f5774 100644 --- a/python/cutlass/backend/c_types.py +++ b/python/cutlass/backend/c_types.py @@ -532,29 +532,12 @@ def tuple_factory_(input_tuple, dtype, constants=[0,1]): if first_non_empty_base is None: first_non_empty_base = [] - # Determine whether or not add an additional byte for empty base classes - additional_byte = False - # Special case for constant tuple - if first_non_empty_base is None: - additional_byte = False - else: - for base in first_non_empty_base: - if base in empty_bases: - additional_byte = True - break - - if additional_byte: - ctype_fields = [("empty_byte", EmptyByte), ] + ctype_fields - # Create the ctype tuple class TupleType(ctypes.Structure): _fields_ = ctype_fields def __init__(self, args) -> None: - if additional_byte: - fields = self._fields_[1:] - else: - fields = self._fields_ + fields = self._fields_ assert len(fields) == len(args) for field, arg in zip(fields, args): diff --git a/python/cutlass_library/conv3x_emitter.py b/python/cutlass_library/conv3x_emitter.py index 46cb56d0..459df607 100644 --- 
a/python/cutlass_library/conv3x_emitter.py +++ b/python/cutlass_library/conv3x_emitter.py @@ -69,7 +69,7 @@ using ${operation_name}_epilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ${arch}, ${opcode_class_epi}, - ${output_cta_tile_shape}, // output cta tile shape + ${mma_tile_shape}, // mma tile shape ${cluster_shape}, // cluster shape ${epi_tile_mn}, ${element_accumulator}, @@ -109,26 +109,6 @@ using ${operation_name}_base = cutlass::conv::kernel::ConvUniversal< def arch_number_to_type(self, arch: int) -> str: return f"cutlass::arch::Sm{arch}" - def output_cta_tile_shape(self, operation, cta_m, cta_n, cta_k) -> str: - # For all three kinds of convolutions, the tile shape's K mode - # differs from GEMM in that needs to be wrapped in a Shape. - # For Wgrad convolutions specifically, - # the N tile shape also needs to be wrapped in a Shape. - m_template = 'cute::_${cta_m}' - if operation.conv_kind == ConvKind.Wgrad: - n_template = 'cute::Shape' - else: - n_template = 'cute::_${cta_n}' - k_template = 'cute::Shape' - - output_cta_tile_shape_template = f'cute::Shape<{m_template}, {n_template}, {k_template}>' - values = { - 'cta_m': cta_m, - 'cta_n': cta_n, - 'cta_k': cta_k - } - return Template(output_cta_tile_shape_template).substitute(values) - def mma_tile_shape(self, operation, cta_m, cta_n, cta_k) -> str: mma_m = cta_m mma_n = cta_n @@ -223,7 +203,6 @@ using ${operation_name}_base = cutlass::conv::kernel::ConvUniversal< 'element_accumulator': DataTypeTag[operation.accumulator_type()], 'opcode_class': opcode_class, 'arch': self.arch_number_to_type(operation.arch), - 'output_cta_tile_shape': self.output_cta_tile_shape(operation, cta_m, cta_n, cta_k), 'mma_tile_shape': self.mma_tile_shape(operation, cta_m, cta_n, cta_k), 'cluster_shape': self.cluster_shape(operation), 'opcode_class_epi': opcode_class_epi, diff --git a/python/cutlass_library/emit_kernel_listing.py b/python/cutlass_library/emit_kernel_listing.py index 96733d60..52598d73 100755 --- a/python/cutlass_library/emit_kernel_listing.py +++ b/python/cutlass_library/emit_kernel_listing.py @@ -90,19 +90,32 @@ def hash_cutlass_string(input_string): def transform_hashed_string(hashed_kernel_name, runtime_datatype_a, runtime_datatype_b): # Define a dictionary mapping the detected types to runtime values datatype_map = { - '_f4_': '_' + runtime_datatype_a + '_', - '_f6_': '_' + runtime_datatype_b + '_', - '_f8_': '_' + runtime_datatype_a + '_', + 'f4_f4': runtime_datatype_a + '_' + runtime_datatype_b, + 'f4_f6': runtime_datatype_a + '_' + runtime_datatype_b, + 'f4_f8': runtime_datatype_a + '_' + runtime_datatype_b, + 'f6_f4': runtime_datatype_a + '_' + runtime_datatype_b, + 'f6_f6': runtime_datatype_a + '_' + runtime_datatype_b, + 'f6_f8': runtime_datatype_a + '_' + runtime_datatype_b, + 'f8_f4': runtime_datatype_a + '_' + runtime_datatype_b, + 'f8_f6': runtime_datatype_a + '_' + runtime_datatype_b, + 'f8_f8': runtime_datatype_a + '_' + runtime_datatype_b, + 'ue8m0xf4_ue8m0xf4': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b, + 'ue4m3xf4_ue4m3xf4': 'ue4m3x' + runtime_datatype_a + '_ue4m3x' + runtime_datatype_b, + 'ue8m0xf4_ue8m0xf6': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b, + 'ue8m0xf4_ue8m0xf8': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b, + 'ue8m0xf6_ue8m0xf4': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b, + 'ue8m0xf6_ue8m0xf6': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b, + 'ue8m0xf8_ue8m0xf4': 'ue8m0x' + runtime_datatype_a + 
'_ue8m0x' + runtime_datatype_b, + 'ue8m0xf8_ue8m0xf6': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b, + 'ue8m0xf8_ue8m0xf8': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b, } - # Use regex to identify and replace _f4_, _f6_, or _f8_ in the kernel name - def substitute(match): - datatype = match.group(0) # This is the matched "_f4_", "_f6_", or "_f8_" - return datatype_map.get(datatype, datatype) # Replace or leave as is + # Regular expression to detect all the keys in datatype_map + pattern = re.compile(r'(' + '|'.join(map(re.escape, datatype_map.keys())) + r')') + + # Replace detected patterns using the dictionary + updated_kernel_name = pattern.sub(lambda match: datatype_map[match.group(0)], hashed_kernel_name) - # Regex to find "_f4_", "_f6_", or "_f8_" in the hashed_kernel_name - updated_kernel_name = re.sub(r'_f4_|_f6_|_f8_', substitute, hashed_kernel_name) - return updated_kernel_name # This helper function reports foundational kernel features: datatypes, layouts, alignment and stream-k. diff --git a/python/cutlass_library/gemm_operation.py b/python/cutlass_library/gemm_operation.py index 5cc4f8b4..2374a131 100644 --- a/python/cutlass_library/gemm_operation.py +++ b/python/cutlass_library/gemm_operation.py @@ -64,17 +64,15 @@ class GemmOperation: def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \ epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, D = None, kernel_schedule = KernelScheduleType.ScheduleAuto, epilogue_schedule = EpilogueScheduleType.ScheduleAuto, - tile_scheduler = TileSchedulerType.Default, mixed_input_mode = None, mixed_input_shuffle = False - - , ScaleFactorA = None, ScaleFactorB = None, ScaleFactorD = None - - ): + tile_scheduler = TileSchedulerType.Default, mixed_input_mode = None, mixed_input_shuffle = False, + ScaleFactorA = None, ScaleFactorB = None, ScaleFactorD = None): kinds_3x = { GemmKind.Universal3x, GemmKind.SparseUniversal3x, GemmKind.BlockScaledUniversal3x, - GemmKind.GroupedGemmUniversal3x, + GemmKind.GroupedUniversal3x, + GemmKind.GroupedBlockScaledUniversal3x, } self.is_3x = gemm_kind in kinds_3x self.prefix = "3x" if self.is_3x else "" @@ -87,13 +85,11 @@ class GemmOperation: self.C = C self.D = D - - if self.gemm_kind == GemmKind.BlockScaledUniversal3x: + if is_block_scaled(gemm_kind): self.ScaleFactorA = ScaleFactorA self.ScaleFactorB = ScaleFactorB self.ScaleFactorD = ScaleFactorD["tensor"] self.ScaleFactorVectorSize = ScaleFactorD["vector_size"] - if self.D == None: self.D = self.C @@ -239,13 +235,13 @@ class GemmOperation: element_c = DataTypeNames[self.C.element], element_d = DataTypeNames[self.D.element], core_name = self.core_name()) - - if self.gemm_kind == GemmKind.BlockScaledUniversal3x: + + if is_block_scaled(self.gemm_kind): d_type_names = DataTypeNames[self.D.element] - + if self.ScaleFactorD.element != DataType.void: d_type_names = DataTypeNames[self.ScaleFactorD.element] + "x" + d_type_names - + extended_name = "{core_name}_{element_sfa}x{element_a}_{element_sfb}x{element_b}_{element_acc}_{element_c}_{element_d}".format( element_sfa = DataTypeNames[self.ScaleFactorA], element_a = DataTypeNames[self.A.element], @@ -255,7 +251,7 @@ class GemmOperation: element_c = DataTypeNames[self.C.element], element_d = d_type_names, core_name = self.core_name()) - + if self.mixed_input_mode != None: extended_name = extended_name + self.mixed_input_mode_name() return extended_name @@ -298,8 +294,8 @@ class GemmOperation: # Generates a 
short string representing underlying epilogue schedule type def epilogue_schedule_name_3x(self): - - if self.gemm_kind == GemmKind.BlockScaledUniversal3x: + + if is_block_scaled(self.gemm_kind): if self.ScaleFactorD.element != DataType.void: return EpilogueScheduleSuffixes[self.epilogue_schedule] + "_epiVs" + str(self.ScaleFactorVectorSize)+ShortLayoutTypeNames[self.ScaleFactorD.layout] @@ -779,7 +775,7 @@ class EmitGemmUniversal3xInstance: using ${operation_name}_epilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ${arch}, ${opcode_class_epi}, - cute::Shape, + cute::Shape, cute::Shape<${cluster_shape_m}, ${cluster_shape_n}, ${cluster_shape_k}>, ${epi_tile_mn}, ${element_accumulator}, ${element_epilogue}, @@ -797,7 +793,7 @@ using ${operation_name}_mainloop = ${element_a}, ${layout_a}, ${align_a}, ${element_b}, ${layout_b}, ${align_b}, ${element_accumulator}, - cute::Shape, + cute::Shape, cute::Shape<${cluster_shape_m}, ${cluster_shape_n}, ${cluster_shape_k}>, ${stages}, ${kernel_schedule} @@ -855,7 +851,7 @@ ${compile_guard_end} @staticmethod def pointerize_if_grouped(operation, layout): - return layout if operation.gemm_kind != GemmKind.GroupedGemmUniversal3x else layout + "* " + return layout if not is_grouped(operation.gemm_kind) else layout + "* " @staticmethod def problem_shape(operation): @@ -863,7 +859,7 @@ ${compile_guard_end} grouped_gemm_shape_type = "cute::Shape" grouped_gemm_shape_type = "cutlass::gemm::GroupProblemShape<" + grouped_gemm_shape_type + ">" - return gemm_shape_type if operation.gemm_kind != GemmKind.GroupedGemmUniversal3x else grouped_gemm_shape_type + return gemm_shape_type if not is_grouped(operation.gemm_kind) else grouped_gemm_shape_type def emit(self, operation): _LOGGER.debug("*** EmitGemmConfigurationLibrary::emit(operation)") @@ -874,18 +870,12 @@ ${compile_guard_end} opcode_class_main = operation.tile_description.math_instruction.opcode_class opcode_class_epi = opcode_class_main - if opcode_class_main == OpcodeClass.BlockScaledTensorOp: - if operation.epilogue_schedule != EpilogueScheduleType.NoSmemWarpSpecialized: - opcode_class_epi = OpcodeClass.TensorOp - - tile_shape = operation.tile_description.tile_shape instruction_shape = operation.tile_description.math_instruction.instruction_shape cluster_m = operation.tile_description.cluster_shape[0] cluster_n = operation.tile_description.cluster_shape[1] - tile_shape_main_m, tile_shape_main_n, tile_shape_main_k = tile_shape - tile_shape_epi_m, tile_shape_epi_n, tile_shape_epi_k = tile_shape + tile_shape_m, tile_shape_n, tile_shape_k = tile_shape # account for static/dynamic cluster shapes cta_m = tile_shape[0] // cluster_m if cluster_m > 0 else tile_shape[0] @@ -902,10 +892,8 @@ ${compile_guard_end} if opcode_class_main in [OpcodeClass.TensorOp , OpcodeClass.BlockScaledTensorOp ]: - tile_shape_main_m = instruction_shape[0] - tile_shape_main_n = instruction_shape[1] - tile_shape_epi_m = cta_m - tile_shape_epi_n = cta_n + tile_shape_m = instruction_shape[0] + tile_shape_n = instruction_shape[1] # stage count set to zero indicates builder automatic stage selection @@ -930,35 +918,36 @@ ${compile_guard_end} } epilogue_functor = SubstituteTemplate(self.builtin_epilogue_functor_template, values) - if operation.gemm_kind == GemmKind.BlockScaledUniversal3x and operation.ScaleFactorD.element != DataType.void: + if is_block_scaled(operation.gemm_kind) and operation.ScaleFactorD.element != DataType.void: epilogue_functor = self.emit_block_scale_epilogue_functor(operation) - + else: epilogue_functor = 
self.epilogue_functor.emit_declaration() - - if operation.gemm_kind == GemmKind.BlockScaledUniversal3x and operation.ScaleFactorD.element != DataType.void: + + if is_block_scaled(operation.gemm_kind) and operation.ScaleFactorD.element != DataType.void: epilogue_functor = self.emit_block_scale_epilogue_functor(operation) - + # # Cutlass3x complex kernels' ElementA(B) is a tuple in collective mainloop builder, e.g. cute::tuple, Transform : cute::identity / cute::conjugate. element_a = DataTypeTag[operation.A.element] if not operation.is_complex() else f"cute::tuple<{str(DataTypeTag[operation.A.element])},{str(ComplexTransformTag3x[operation.A.complex_transform])}>" element_b = DataTypeTag[operation.B.element] if not operation.is_complex() else f"cute::tuple<{str(DataTypeTag[operation.B.element])},{str(ComplexTransformTag3x[operation.B.complex_transform])}>" epilogue_schedule_type = EpilogueScheduleTag[operation.epilogue_schedule] - is_no_smem_epilogue = operation.epilogue_schedule == EpilogueScheduleType.NoSmemWarpSpecialized if opcode_class_main == OpcodeClass.BlockScaledTensorOp: - if cta_n == 256 and operation.kernel_schedule == KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100: + is_no_smem_epilogue = operation.epilogue_schedule in [EpilogueScheduleType.NoSmemWarpSpecialized1Sm, EpilogueScheduleType.NoSmemWarpSpecialized2Sm] + grouped = is_grouped(operation.gemm_kind) + if cta_n == 256 and operation.kernel_schedule == to_grouped_schedule(KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100, grouped): epi_tile_mn = "cute::Shape" if not is_no_smem_epilogue: - epilogue_schedule_type = EpilogueScheduleTag[EpilogueScheduleType.TmaWarpSpecialized1Sm] - if cta_n == 256 and operation.kernel_schedule == KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100: + epilogue_schedule_type = EpilogueScheduleTag[to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized1Sm, grouped)] + if cta_n == 256 and operation.kernel_schedule == to_grouped_schedule(KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100, grouped): epi_tile_mn = "cute::Shape" if not is_no_smem_epilogue: - epilogue_schedule_type = EpilogueScheduleTag[EpilogueScheduleType.TmaWarpSpecialized2Sm] + epilogue_schedule_type = EpilogueScheduleTag[to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized2Sm, grouped)] element_a = f'cute::tuple<{str(element_a)},{str(DataTypeTag[operation.ScaleFactorA])}>' element_b = f'cute::tuple<{str(element_b)},{str(DataTypeTag[operation.ScaleFactorB])}>' - + operation_name_str = operation.procedural_name() layout_a_str = LayoutTag[instance_layout_A] @@ -1041,12 +1030,9 @@ using {operation_name_str}_LayoutNarrowReordered = decltype(cute::tile_to_shape( 'opcode_class_main': OpcodeClassTag[opcode_class_main], 'opcode_class_epi': OpcodeClassTag[opcode_class_epi], 'arch': "cutlass::arch::Sm%d" % operation.arch, - 'tile_shape_epi_m': str(tile_shape_epi_m), - 'tile_shape_epi_n': str(tile_shape_epi_n), - 'tile_shape_epi_k': str(tile_shape_epi_k), - 'tile_shape_main_m': str(tile_shape_main_m), - 'tile_shape_main_n': str(tile_shape_main_n), - 'tile_shape_main_k': str(tile_shape_main_k), + 'tile_shape_m': str(tile_shape_m), + 'tile_shape_n': str(tile_shape_n), + 'tile_shape_k': str(tile_shape_k), 'cluster_shape_m': 'cute::_' + str(operation.tile_description.cluster_shape[0]) if operation.tile_description.cluster_shape[0] > 0 else "int", 'cluster_shape_n': 'cute::_' + str(operation.tile_description.cluster_shape[1]) if operation.tile_description.cluster_shape[1] > 0 else "int", 'cluster_shape_k': 'cute::_' + 
str(operation.tile_description.cluster_shape[2]) if operation.tile_description.cluster_shape[2] > 0 else "int", @@ -1396,7 +1382,8 @@ class EmitGemmConfigurationLibrary: GemmKind.PlanarComplex: EmitGemmPlanarComplexInstance, GemmKind.PlanarComplexArray: EmitGemmPlanarComplexArrayInstance, GemmKind.Grouped: EmitGemmGroupedInstance, - GemmKind.GroupedGemmUniversal3x: EmitGemmUniversal3xInstance, + GemmKind.GroupedUniversal3x: EmitGemmUniversal3xInstance, + GemmKind.GroupedBlockScaledUniversal3x: EmitGemmUniversal3xInstance, } self.gemm_kind_wrappers = { @@ -1409,7 +1396,8 @@ class EmitGemmConfigurationLibrary: GemmKind.PlanarComplex: 'GemmPlanarComplexOperation', GemmKind.PlanarComplexArray: 'GemmPlanarComplexArrayOperation', GemmKind.Grouped: 'GemmGroupedOperation', - GemmKind.GroupedGemmUniversal3x: 'GroupedGemmUniversal3xOperation' + GemmKind.GroupedUniversal3x: 'GroupedGemmUniversal3xOperation', + GemmKind.GroupedBlockScaledUniversal3x: 'GroupedBlockScaledGemmUniversal3xOperation', } self.wmma_guard_start = "#if defined(CUTLASS_ARCH_WMMA_SM${sm_number}_ENABLED)" diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py index a4cf5f90..d70f9ee8 100644 --- a/python/cutlass_library/generator.py +++ b/python/cutlass_library/generator.py @@ -217,8 +217,7 @@ def CreateGemmUniversal3xOperator( gemm_op_extra_args["ScaleFactorB"] = data_type["sf_type"] gemm_op_extra_args["ScaleFactorD"] = { "tensor": TensorDescription(data_type["sfd_type"]["type"], data_type["sfd_type"]["layout"]), "vector_size" : data_type["sfd_type"]["vector_size"]} - gemm_kind = GemmKind.BlockScaledUniversal3x - + assert is_block_scaled(gemm_kind) A_dtype = data_type["a_type"] B_dtype = data_type["b_type"] @@ -254,9 +253,6 @@ def CreateGemmUniversal3xOperator( return operations -def is_grouped(gemm_kind): - return gemm_kind == GemmKind.GroupedGemmUniversal3x - # Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts def CreateSparseGemmUniversal3xOperator( manifest, layouts, tile_descriptions, data_types, @@ -6654,11 +6650,13 @@ def get_tma_alignment_elt(data_type : DataType, is_f8f6f4 : bool = True ) -> int sm100_cluster_shape_1sm = [ [4,4,1] + , DynamicClusterShape ] sm100_cluster_shape_2sm = [ # cluster_m % 2 == 0 for 2sm [4,4,1] + , DynamicClusterShape ] def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version): @@ -6718,6 +6716,7 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version): ] cluster_shapes_1sm = [[1,2,1], [1,1,1], [1,4,1], [4,4,1] + , DynamicClusterShape ] tile_schedulers = [ @@ -6765,6 +6764,7 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version): ] cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1] + , DynamicClusterShape ] for math_inst in math_instructions_2sm: @@ -7517,8 +7517,227 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [[kernel_schedule, epi_schedule]], tile_schedulers=tile_schedulers, gemm_kind=gemm_kind) +def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): + # SM100 MMA with mixed F4/F6/F8 inputs + without block scale + if not CudaToolkitVersionSatisfies(cuda_version, 12, 0): + return -def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cuda_version): + # layouts for ABC and their alignments. 
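Aside: the DynamicClusterShape entries added above feed the tile-sizing rule used throughout the new SM100 generators below; a rough standalone sketch of that rule follows (the DynamicClusterShape sentinel and the cta_tile_1sm helper here are stand-ins for illustration, not the generator's real objects).

# Sketch only: a dynamic cluster collapses the multiplier to (1, 1, 1),
# while a static cluster scales the M/N/K extents of the MMA instruction shape.
DynamicClusterShape = "dynamic"

def cta_tile_1sm(instruction_shape, cluster_shape):
    m_mult, n_mult, k_mult = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape
    m, n, k = instruction_shape
    # The generator multiplies the instruction K extent by 4 to form the CTA tile K.
    return [m * m_mult, n * n_mult, k * 4 * k_mult]

print(cta_tile_1sm([128, 128, 32], [1, 2, 1]))            # [128, 256, 128]
print(cta_tile_1sm([128, 128, 32], DynamicClusterShape))  # [128, 128, 128]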
+ layouts = [ + [[LayoutType.RowMajor, -1], [LayoutType.ColumnMajor, -1], [LayoutType.RowMajor, -1]], + ] + + instruction_sizes_1sm = [ + # [64, 128, 32], + [128, 128, 32], + # [64, 256, 32], + [128, 256, 32], + ] + + instruction_sizes_2sm = [ + # [128, 128, 32], + # [128, 256, 32], + [256, 128, 32], + [256, 256, 32], + ] + + ab_types = [ + DataType.f4, DataType.f6, DataType.f8, + DataType.e2m1, DataType.e3m2, DataType.e4m3, + ] + + acc_types = [ DataType.f32 ] + + tile_schedulers = [ + TileSchedulerType.Default, TileSchedulerType.StreamK + ] + + min_cc = 100 + max_cc = 130 + + epi_type = DataType.f32 + + math_instructions_1sm = [] + + is_runtime_datatype = lambda runtime_datatype: runtime_datatype in (DataType.f4, DataType.f6, DataType.f8) + + # Usage: + + for instr_size, a_type, b_type, acc_type in product(instruction_sizes_1sm, ab_types, ab_types, acc_types): + is_runtime_datatype_a = is_runtime_datatype(a_type) + is_runtime_datatype_b = is_runtime_datatype(b_type) + + # A/B datatypes should be both static or dynamic + if (is_runtime_datatype_a != is_runtime_datatype_b): + continue + + math_instructions_1sm.append( + MathInstruction( + instr_size, + a_type, b_type, acc_type, + OpcodeClass.TensorOp, + MathOperation.multiply_add) + ) + + math_instructions_2sm = [] + + for instr_size, a_type, b_type, acc_type in product(instruction_sizes_2sm, ab_types, ab_types, acc_types): + is_runtime_datatype_a = is_runtime_datatype(a_type) + is_runtime_datatype_b = is_runtime_datatype(b_type) + + # A/B datatypes should be both static or dynamic + if (is_runtime_datatype_a != is_runtime_datatype_b): + continue + + math_instructions_2sm.append( + MathInstruction( + instr_size, + a_type, b_type, acc_type, + OpcodeClass.TensorOp, + MathOperation.multiply_add) + ) + + cluster_shapes_1sm = [ + # [1,2,1], + [2,1,1], + [1,1,1], + # [1,4,1], + [4,4,1] + , DynamicClusterShape + ] + + # 1xSM MMA kernels + for math_inst in math_instructions_1sm: + tile_descriptions = [] + for cluster_shape in cluster_shapes_1sm: + multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape + tile_descriptions.append( + TileDescription([ + math_inst.instruction_shape[0] * multiplier_1sm[0], + math_inst.instruction_shape[1] * multiplier_1sm[1], + math_inst.instruction_shape[2] * 4 * multiplier_1sm[2]], + 0, [4, 1, 1], math_inst, min_cc, max_cc, cluster_shape)) + + kernel_data_types = [ + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.f32, + "d_type" : DataType.f32, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + }, + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.void, + "d_type" : DataType.f32, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + }, + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.void, + "d_type" : DataType.e5m2, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + } + ] + + for kernel_data_type in kernel_data_types: + # Filter out some kernel + if ( kernel_data_type["a_type"] == DataType.e4m3 ) and ( kernel_data_type["b_type"] == DataType.e4m3 ) and\ + ( kernel_data_type["d_type"] == DataType.e5m2 ): + continue + + # Update layout alignment + # alignment for d might be different for each kernel_data_type + layouts_copy = copy.deepcopy(layouts) + for layout in layouts_copy: + # alignment for a + layout[0][1] = get_tma_alignment_elt(kernel_data_type["a_type"]) + # alignment for b + 
layout[1][1] = get_tma_alignment_elt(kernel_data_type["b_type"]) + # alignment for d + layout[2][1] = get_tma_alignment_elt(kernel_data_type["d_type"]) + + CreateGemmUniversal3xOperator(manifest, layouts_copy, tile_descriptions, [kernel_data_type], + [[KernelScheduleType.TmaWarpSpecialized1SmSm100, EpilogueScheduleType.TmaWarpSpecialized1Sm]], tile_schedulers=tile_schedulers) + + cluster_shapes_2sm = [ + [2,1,1], + # [2,2,1], + # [2,4,1], + # [4,1,1], + # [4,2,1], + [4,4,1] + , DynamicClusterShape + ] + + for math_inst in math_instructions_2sm: + tile_descriptions = [] + for cluster_shape in cluster_shapes_2sm: + multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) + tile_descriptions.append( + TileDescription([ + math_inst.instruction_shape[0] * multiplier_2sm[0], + math_inst.instruction_shape[1] * multiplier_2sm[1], + math_inst.instruction_shape[2] * 4 * multiplier_2sm[2]], + 0, [4, 1, 1], math_inst, min_cc, max_cc, cluster_shape)) + + kernel_data_types = [ + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.f32, + "d_type" : DataType.f32, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + }, + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.void, + "d_type" : DataType.f32, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + }, + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.void, + "d_type" : DataType.e5m2, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + } + ] + + for kernel_data_type in kernel_data_types: + # Filter some kernel + if ( kernel_data_type["a_type"] == DataType.e4m3 ) and ( kernel_data_type["b_type"] == DataType.e4m3 ) and\ + ( kernel_data_type["d_type"] == DataType.e5m2 ): + continue + + # Update layout alignment + # alignment for d might be different for each kernel_data_type + layouts_copy = copy.deepcopy(layouts) + for layout in layouts_copy: + # alignment for a + layout[0][1] = get_tma_alignment_elt(kernel_data_type["a_type"]) + # alignment for b + layout[1][1] = get_tma_alignment_elt(kernel_data_type["b_type"]) + # alignment for d + layout[2][1] = get_tma_alignment_elt(kernel_data_type["d_type"]) + + if math_inst.instruction_shape[0] == 128: + CreateGemmUniversal3xOperator(manifest, layouts_copy, tile_descriptions, [kernel_data_type], + [[KernelScheduleType.TmaWarpSpecialized2SmSm100, EpilogueScheduleType.TmaWarpSpecialized2Sm]], tile_schedulers=tile_schedulers) + else: + CreateGemmUniversal3xOperator(manifest, layouts_copy, tile_descriptions, [kernel_data_type], + [[KernelScheduleType.TmaWarpSpecialized2SmSm100, EpilogueScheduleType.ScheduleAuto]], tile_schedulers=tile_schedulers) + +def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cuda_version, gemm_kind=GemmKind.BlockScaledUniversal3x): # SM100 MMA with mixed F4/F6/F8 inputs + block scale if not CudaToolkitVersionSatisfies(cuda_version, 12, 8): return @@ -7529,7 +7748,7 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud ] instruction_sizes_1sm = [ - [128, 128, 32], [128, 256, 32], # Mixed F4/F6/F8 block scaled only supports M=128 for 1SM cases + [128, 128, 32], [128, 256, 32], # Block scaled kernels only support M=128 for 1SM cases ] instruction_sizes_2sm = [ @@ -7670,8 +7889,7 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud for data_type in 
data_types: CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [[KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100, EpilogueScheduleType.TmaWarpSpecialized1Sm]] - , tile_schedulers = tile_schedulers(data_type["sfd_type"]) - ) + , tile_schedulers = tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind) cluster_shapes_2sm = [ [2,1,1], @@ -7766,21 +7984,21 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud if math_inst.instruction_shape[0] == 128: CreateGemmUniversal3xOperator(manifest, [layout], [tile], [data_type], [[KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100, EpilogueScheduleType.TmaWarpSpecialized2Sm]] - , tile_schedulers = tile_schedulers(data_type["sfd_type"]) - ) + , tile_schedulers = tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind) else: CreateGemmUniversal3xOperator(manifest, [layout], [tile], [data_type], [[KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100, EpilogueScheduleType.ScheduleAuto]] - , tile_schedulers = tile_schedulers(data_type["sfd_type"]) - ) + , tile_schedulers = tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind) -def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_version): +def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_version, gemm_kind=GemmKind.BlockScaledUniversal3x): # SM100 MMA with F4 + block scale if not CudaToolkitVersionSatisfies(cuda_version, 12, 8): return + grouped = is_grouped(gemm_kind) + # layouts for ABC and their alignments. layouts = [ [[LayoutType.RowMajor, 32], [LayoutType.ColumnMajor, 32], [LayoutType.RowMajor, 0]], @@ -7805,7 +8023,7 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio def tile_schedulers(sfdtype): # Only use the stream-K scheduler for non-void SFD to limit kernel count. When SFD is void, # the epilogue is the traditional linear combination, for which we already have tests with stream-K. - if sfdtype["type"] == DataType.void: + if sfdtype["type"] == DataType.void or grouped: return [TileSchedulerType.Default] else: return [TileSchedulerType.Default, TileSchedulerType.StreamK] @@ -7826,6 +8044,10 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio if (is_runtime_datatype_a != is_runtime_datatype_b): continue + # grouped GEMM does not support runtime data type yet + if grouped and (is_runtime_datatype_a or is_runtime_datatype_b): + continue + math_instructions_1sm.append( MathInstruction( instr_size, @@ -7853,6 +8075,10 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio if (is_runtime_datatype_a != is_runtime_datatype_b): continue + # grouped GEMM does not support runtime data type yet + if grouped and (is_runtime_datatype_a or is_runtime_datatype_b): + continue + math_instructions_2sm.append( MathInstruction( instr_size, @@ -7972,15 +8198,21 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio for data_type in data_types: if data_type["sfd_type"]["type"] != DataType.void and (data_type["d_type"] == DataType.e2m1): data_type["sfd_type"]["layout"] = layout[2][0] # For FP4 output , the scalefactor layout is same layout as D layout. 
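Aside: a hedged, self-contained sketch of the two grouped-GEMM gates used above — scheduler selection and the runtime-datatype filter. The DataType/TileSchedulerType enums and the helper names here are simplified stand-ins, not the library's real types.

from enum import Enum, auto

class DataType(Enum):
    void = auto(); f4 = auto(); f6 = auto(); f8 = auto(); e2m1 = auto(); ue8m0 = auto()

class TileSchedulerType(Enum):
    Default = auto(); StreamK = auto()

RUNTIME_DATATYPES = {DataType.f4, DataType.f6, DataType.f8}

def tile_schedulers(sfd_type, grouped):
    # Stream-K is only added for a non-void SFD, and never for grouped GEMM,
    # to keep the emitted kernel count down.
    if sfd_type == DataType.void or grouped:
        return [TileSchedulerType.Default]
    return [TileSchedulerType.Default, TileSchedulerType.StreamK]

def keep_math_instruction(a_type, b_type, grouped):
    is_rt_a, is_rt_b = a_type in RUNTIME_DATATYPES, b_type in RUNTIME_DATATYPES
    if is_rt_a != is_rt_b:
        return False   # A/B datatypes must be both static or both dynamic
    if grouped and (is_rt_a or is_rt_b):
        return False   # grouped GEMM does not support runtime datatypes yet
    return True

print(tile_schedulers(DataType.ue8m0, grouped=True))                      # [Default]
print(keep_math_instruction(DataType.f4, DataType.e2m1, grouped=False))   # False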
+ # E2M1 x E2M1, vector size 32, E8 + # E2M1 x E2M1, vector size 16, UE4M3 isFp4 = math_inst.element_scale_factor == DataType.ue8m0 and math_inst.element_a == DataType.e2m1 and math_inst.element_b == DataType.e2m1 - nvfp4_schedule = [KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100, EpilogueScheduleType.TmaWarpSpecialized1Sm] - fp4_schedule = [KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100, EpilogueScheduleType.TmaWarpSpecialized1Sm] + epi_schedule = to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized1Sm, grouped) + nvfp4_kernel_schedule = to_grouped_schedule(KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100, grouped) + fp4_kernel_schedule = to_grouped_schedule(KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100, grouped) + + nvfp4_schedule = [nvfp4_kernel_schedule, epi_schedule] + fp4_schedule = [fp4_kernel_schedule, epi_schedule] CreateGemmUniversal3xOperator(manifest, [layout], tile_descriptions, data_type, [nvfp4_schedule] - , tile_schedulers=tile_schedulers(data_type["sfd_type"]) + , tile_schedulers=tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind ) if isFp4: CreateGemmUniversal3xOperator(manifest, [layout], tile_descriptions, data_type, [fp4_schedule] - , tile_schedulers=tile_schedulers(data_type["sfd_type"]) + , tile_schedulers=tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind ) cluster_shapes_2sm = [ @@ -8085,18 +8317,20 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio for data_type in data_types: if data_type["sfd_type"]["type"] != DataType.void and (data_type["d_type"] == DataType.e2m1): data_type["sfd_type"]["layout"] = layout[2][0] # For FP4 output , the scalefactor layout is same layout as D layout. - # E2M1 x E2M1, vector size 32, E8 + # E2M1 x E2M1, vector size 32, E8 isFp4 = math_inst.element_scale_factor == DataType.ue8m0 and math_inst.element_a == DataType.e2m1 and math_inst.element_b == DataType.e2m1 - nvfp4_schedule = [KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100, EpilogueScheduleType.ScheduleAuto] - fp4_schedule = [KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100, EpilogueScheduleType.ScheduleAuto] + epi_schedule = EpilogueScheduleType.ScheduleAuto if not grouped else EpilogueScheduleType.PtrArrayTmaWarpSpecialized2Sm + nvfp4_kernel_schedule = to_grouped_schedule(KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100, grouped) + fp4_kernel_schedule = to_grouped_schedule(KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100, grouped) + + nvfp4_schedule = [nvfp4_kernel_schedule, epi_schedule] + fp4_schedule = [fp4_kernel_schedule, epi_schedule] CreateGemmUniversal3xOperator(manifest, [layout], tile_descriptions, data_type, [nvfp4_schedule] - , tile_schedulers=tile_schedulers(data_type["sfd_type"]) - ) + , tile_schedulers=tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind) if isFp4: CreateGemmUniversal3xOperator(manifest, [layout], tile_descriptions, data_type, [fp4_schedule] - , tile_schedulers=tile_schedulers(data_type["sfd_type"]) - ) + , tile_schedulers=tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind) @@ -8139,6 +8373,7 @@ def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version): MathOperation.multiply_add)] cluster_shapes_1sm = [[1,2,1], [2,1,1], [1,1,1], [1,4,1], [4,4,1] + , DynamicClusterShape ] tile_schedulers = [ @@ -8237,6 +8472,7 @@ def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version): ] cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1] + , DynamicClusterShape ] for math_inst in math_instructions_2sm: @@ -8353,6 +8589,7 @@ def 
GenerateSM100_TensorOp_32b_UMMA_gemm_stream_k(manifest, cuda_version): cluster_shapes_1sm = [ [1,2,1], [1,1,1], [1,4,1], [4,4,1] + , DynamicClusterShape ] tile_schedulers = [ @@ -8386,6 +8623,7 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm_stream_k(manifest, cuda_version): cluster_shapes_2sm = [ [2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,4,1] + , DynamicClusterShape ] for math_inst in math_instructions_2sm: @@ -8431,6 +8669,7 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version): cluster_shapes_1sm = [ [1,2,1], [1,1,1], [4,4,1] + , DynamicClusterShape ] tile_schedulers = [ @@ -8498,6 +8737,7 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version): cluster_shapes_2sm = [ [2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,4,1] + , DynamicClusterShape ] for math_inst in math_instructions_2sm: @@ -8554,6 +8794,125 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version): [[KernelScheduleType.TmaWarpSpecialized2SmSm100, epi_schedule]], tile_schedulers=tile_schedulers) +def GenerateSM100_TensorOp_fp8_UMMA_gemm_stream_k(manifest, cuda_version): + if not CudaToolkitVersionSatisfies(cuda_version, 12, 0): + return + + # layouts for ABC and their alignments. + layouts = [ + [[LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 0]], + [[LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 0]], + [[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 0]], + [[LayoutType.RowMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 0]], + [[LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 0]], + ] + + min_cc = 100 + max_cc = 130 + + epi_type = DataType.f32 + + math_instructions_1sm = [ + MathInstruction( + [128, 256, 32], + DataType.e4m3, DataType.e4m3, DataType.f32, + OpcodeClass.TensorOp, + MathOperation.multiply_add)] + + cluster_shapes_1sm = [ + [1,2,1], [2,1,1], [1,1,1], [4,4,1] + , DynamicClusterShape + ] + + tile_schedulers = [ + TileSchedulerType.StreamK, + ] + + # 1xSM MMA kernels + for math_inst in math_instructions_1sm: + tile_descriptions = [] + for cluster_shape in cluster_shapes_1sm: + multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape + tile_descriptions.append( + TileDescription([ + math_inst.instruction_shape[0] * multiplier_1sm[0], + math_inst.instruction_shape[1] * multiplier_1sm[1], + math_inst.instruction_shape[2] * 4 * multiplier_1sm[2]], + 0, [4, 1, 1], math_inst, min_cc, max_cc, cluster_shape)) + + data_types = [ + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.f16, + "d_type" : DataType.e4m3, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + }] + + # Set alignment d based on Destination format. 
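Aside on the alignment line that follows this comment: it divides a 128-bit granule by the element width, assuming (as elsewhere in cutlass_library) that DataTypeSize is expressed in bits; a tiny illustration with a stand-in width table.

# Illustration only: stand-in width table in bits; the D alignment is the
# number of elements that fit in a 128-bit granule.
DataTypeSize = {"e4m3": 8, "f16": 16, "f32": 32}
for name, bits in DataTypeSize.items():
    print(name, 128 // bits)   # e4m3 -> 16, f16 -> 8, f32 -> 4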
+ for layout in layouts: + layout[2][1] = 128 // DataTypeSize[data_types[0]["d_type"]] + + for data_type in data_types: + if ( data_type["a_type"] == DataType.e4m3 ) and ( data_type["b_type"] == DataType.e4m3 ) and\ + ( data_type["d_type"] == DataType.e5m2 ): + continue + CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, + [[KernelScheduleType.TmaWarpSpecialized1SmSm100, EpilogueScheduleType.TmaWarpSpecialized1Sm]], + tile_schedulers=tile_schedulers) + + # 2xSM MMA kernels + math_instructions_2sm = [ + MathInstruction( + [256, 256, 32], + DataType.e4m3, DataType.e4m3, DataType.f32, + OpcodeClass.TensorOp, + MathOperation.multiply_add), + ] + + cluster_shapes_2sm = [ + [2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,4,1] + , DynamicClusterShape + ] + + for math_inst in math_instructions_2sm: + tile_descriptions = [] + for cluster_shape in cluster_shapes_2sm: + multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) + tile_descriptions.append( + TileDescription([ + math_inst.instruction_shape[0] * multiplier_2sm[0], + math_inst.instruction_shape[1] * multiplier_2sm[1], + math_inst.instruction_shape[2] * 4 * multiplier_2sm[2]], + 0, [4, 1, 1], math_inst, min_cc, max_cc, cluster_shape)) + + data_types = [ + { + "a_type" : math_inst.element_a, + "b_type" : math_inst.element_b, + "c_type" : DataType.f16, + "d_type" : DataType.e4m3, + "acc_type" : math_inst.element_accumulator, + "epi_type" : epi_type, + }] + + # Set alignment d based on Destination format. + for layout in layouts: + layout[2][1] = 128 // DataTypeSize[data_types[0]["d_type"]] + + for data_type in data_types: + if ( data_type["a_type"] == DataType.e4m3 ) and ( data_type["b_type"] == DataType.e4m3 ) and\ + ( data_type["d_type"] == DataType.e5m2 ): + continue + + if math_inst.instruction_shape[0] == 128: + epi_schedule = EpilogueScheduleType.TmaWarpSpecialized2Sm + else: + epi_schedule = EpilogueScheduleType.ScheduleAuto + + CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, + [[KernelScheduleType.TmaWarpSpecialized2SmSm100, epi_schedule]], tile_schedulers=tile_schedulers) def GenerateSM100(manifest, cuda_version): # @@ -8570,13 +8929,19 @@ def GenerateSM100(manifest, cuda_version): GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version) # grouped GEMM - GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedGemmUniversal3x) - GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedGemmUniversal3x) + GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedUniversal3x) + GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedUniversal3x) + + GenerateSM100_TensorOp_fp8_UMMA_gemm_stream_k(manifest, cuda_version) + + # StreamK is included in regular generation + GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version) # # Block Scaled Gemm # GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cuda_version) GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_version) + GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_version, gemm_kind=GemmKind.GroupedBlockScaledUniversal3x) ################################################################################################### @@ -8955,8 +9320,8 @@ def GenerateSM90(manifest, cuda_version): GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version) 
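Aside: the grouped generation paths wired up above depend on the to_grouped_schedule helper added to library.py further down in this diff; below is a simplified, self-contained illustration with stand-in enums (not the library's full mapping table).

from enum import Enum, auto

class Kernel(Enum):
    Nvf4TmaWarpSpecialized1SmSm100 = auto()
    PtrArrayNvf4TmaWarpSpecialized1SmSm100 = auto()

class Epilogue(Enum):
    TmaWarpSpecialized1Sm = auto()
    PtrArrayTmaWarpSpecialized1Sm = auto()

GROUPED_SCHEDULE_MAP = {
    Kernel.Nvf4TmaWarpSpecialized1SmSm100: Kernel.PtrArrayNvf4TmaWarpSpecialized1SmSm100,
    Epilogue.TmaWarpSpecialized1Sm: Epilogue.PtrArrayTmaWarpSpecialized1Sm,
}

def to_grouped_schedule(schedule, grouped):
    # Non-grouped kinds keep their schedule; grouped kinds swap in the
    # PtrArray variant (a KeyError means the schedule has no grouped form).
    return schedule if not grouped else GROUPED_SCHEDULE_MAP[schedule]

print(to_grouped_schedule(Kernel.Nvf4TmaWarpSpecialized1SmSm100, grouped=True))
# Kernel.PtrArrayNvf4TmaWarpSpecialized1SmSm100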
GenerateSM90_TensorOp_mixed_dtype_WGMMA_gemm(manifest, cuda_version) GenerateSM90_TensorOp_1684(manifest, cuda_version) - GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedGemmUniversal3x) - GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedGemmUniversal3x) + GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedUniversal3x) + GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version, gemm_kind=GemmKind.GroupedUniversal3x) GenerateSM90_TensorOp_1684_complex(manifest, cuda_version) GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version) GenerateSM90_TensorOp_1684_rank_k(manifest, cuda_version) diff --git a/python/cutlass_library/library.py b/python/cutlass_library/library.py index 89e72f2b..bc2cc7b1 100644 --- a/python/cutlass_library/library.py +++ b/python/cutlass_library/library.py @@ -321,6 +321,12 @@ def is_complex(data_type): return True return False +def is_block_scaled(gemm_kind): + return gemm_kind in (GemmKind.BlockScaledUniversal3x, GemmKind.GroupedBlockScaledUniversal3x) + +def is_grouped(gemm_kind): + return gemm_kind in (GemmKind.GroupedUniversal3x, GemmKind.GroupedBlockScaledUniversal3x) + # def get_complex_from_real(real_type): for r, c in RealComplexBijection: @@ -482,23 +488,32 @@ class KernelScheduleType(enum.Enum): TmaWarpSpecializedCooperativeFP8FastAccum = enum_auto() TmaWarpSpecializedPingpongFP8FastAccum = enum_auto() ImplicitTmaWarpSpecializedSm90 = enum_auto() - + TmaWarpSpecialized1SmSm100 = enum_auto() TmaWarpSpecialized2SmSm100 = enum_auto() PtrArrayTmaWarpSpecialized1SmSm100 = enum_auto() PtrArrayTmaWarpSpecialized2SmSm100 = enum_auto() + PtrArrayTmaWarpSpecialized1SmBlockScaledSm100 = enum_auto() + PtrArrayTmaWarpSpecialized2SmBlockScaledSm100 = enum_auto() + PtrArrayNvf4TmaWarpSpecialized1SmSm100 = enum_auto() + PtrArrayNvf4TmaWarpSpecialized2SmSm100 = enum_auto() + PtrArrayMxf4TmaWarpSpecialized1SmSm100 = enum_auto() + PtrArrayMxf4TmaWarpSpecialized2SmSm100 = enum_auto() + PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100 = enum_auto() + PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100 = enum_auto() + BlockScaledTmaWarpSpecialized1SmSm100 = enum_auto() BlockScaledTmaWarpSpecialized2SmSm100 = enum_auto() Mxf8f6f4TmaWarpSpecialized1SmSm100 = enum_auto() Mxf8f6f4TmaWarpSpecialized2SmSm100 = enum_auto() - + Mxf4TmaWarpSpecialized1SmSm100 = enum_auto() Mxf4TmaWarpSpecialized2SmSm100 = enum_auto() Nvf4TmaWarpSpecialized1SmSm100 = enum_auto() Nvf4TmaWarpSpecialized2SmSm100 = enum_auto() - + KernelPtrArrayTmaWarpSpecializedCooperative = enum_auto() KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum = enum_auto() KernelPtrArrayTmaWarpSpecializedPingpong = enum_auto() @@ -519,7 +534,7 @@ KernelScheduleTag = { KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum: 'cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum', KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum: 'cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum', KernelScheduleType.ImplicitTmaWarpSpecializedSm90: 'cutlass::conv::KernelImplicitTmaWarpSpecializedSm90', - + KernelScheduleType.TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmSm100', KernelScheduleType.TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmSm100', @@ -530,16 +545,25 @@ KernelScheduleTag = { KernelScheduleType.BlockScaledTmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100', 
KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100', KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100', - + KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmMxf4Sm100', KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmMxf4Sm100', KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100', KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100', - + KernelScheduleType.KernelPtrArrayTmaWarpSpecializedCooperative: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative', KernelScheduleType.KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum', KernelScheduleType.KernelPtrArrayTmaWarpSpecializedPingpong: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpong', KernelScheduleType.KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum', + + KernelScheduleType.PtrArrayTmaWarpSpecialized1SmBlockScaledSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledSm100", + KernelScheduleType.PtrArrayTmaWarpSpecialized2SmBlockScaledSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledSm100", + KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized1SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmNvf4Sm100", + KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized2SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmNvf4Sm100", + KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized1SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100", + KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized2SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100", + KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100", + KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100", } # @@ -568,16 +592,25 @@ KernelScheduleSuffixes = { KernelScheduleType.BlockScaledTmaWarpSpecialized2SmSm100: '_2sm', KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100: '_q_1sm', KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100: '_q_2sm', - + KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100: '_o_vs32_1sm', KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100: '_o_vs32_2sm', KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100: '_o_vs16_1sm', KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100: '_o_vs16_2sm', - + KernelScheduleType.KernelPtrArrayTmaWarpSpecializedCooperative: '_warpspecialized_cooperative', KernelScheduleType.KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum: '_warpspecialized_cooperative_fp8_fastaccum', KernelScheduleType.KernelPtrArrayTmaWarpSpecializedPingpong: '_warpspecialized_pingpong', KernelScheduleType.KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum: '_warpspecialized_pingpong_fp8_fastaccum', + + KernelScheduleType.PtrArrayTmaWarpSpecialized1SmBlockScaledSm100: '_1sm', + KernelScheduleType.PtrArrayTmaWarpSpecialized2SmBlockScaledSm100: '_2sm', + KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized1SmSm100: '_o_vs16_1sm', + KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized2SmSm100: '_o_vs16_2sm', + KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized1SmSm100: 
'_o_vs32_1sm', + KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized2SmSm100: '_o_vs32_2sm', + KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100: '_o_vs32_1sm', + KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100: '_o_vs32_2sm', } class EpilogueScheduleType(enum.Enum): @@ -585,6 +618,10 @@ class EpilogueScheduleType(enum.Enum): EpilogueTransposed = enum_auto() NoSmemWarpSpecialized = enum_auto() PtrArrayNoSmemWarpSpecialized = enum_auto() + NoSmemWarpSpecialized1Sm = enum_auto() + NoSmemWarpSpecialized2Sm = enum_auto() + PtrArrayNoSmemWarpSpecialized1Sm = enum_auto() + PtrArrayNoSmemWarpSpecialized2Sm = enum_auto() TmaWarpSpecialized = enum_auto() TmaWarpSpecializedCooperative = enum_auto() TmaWarpSpecialized1Sm = enum_auto() @@ -600,6 +637,10 @@ EpilogueScheduleTag = { EpilogueScheduleType.EpilogueTransposed: 'cutlass::gemm::EpilogueTransposed', EpilogueScheduleType.NoSmemWarpSpecialized: 'cutlass::epilogue::NoSmemWarpSpecialized', EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized: 'cutlass::epilogue::PtrArrayNoSmemWarpSpecialized', + EpilogueScheduleType.NoSmemWarpSpecialized1Sm: 'cutlass::epilogue::NoSmemWarpSpecialized1Sm', + EpilogueScheduleType.NoSmemWarpSpecialized2Sm: 'cutlass::epilogue::NoSmemWarpSpecialized2Sm', + EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized1Sm: 'cutlass::epilogue::PtrArrayNoSmemWarpSpecialized1Sm', + EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized2Sm: 'cutlass::epilogue::PtrArrayNoSmemWarpSpecialized2Sm', EpilogueScheduleType.TmaWarpSpecialized: 'cutlass::epilogue::TmaWarpSpecialized', EpilogueScheduleType.TmaWarpSpecializedCooperative: 'cutlass::epilogue::TmaWarpSpecializedCooperative', EpilogueScheduleType.TmaWarpSpecialized1Sm: 'cutlass::epilogue::TmaWarpSpecialized1Sm', @@ -616,6 +657,10 @@ EpilogueScheduleSuffixes = { EpilogueScheduleType.EpilogueTransposed: '', EpilogueScheduleType.NoSmemWarpSpecialized: '_epi_nosmem', EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized: '_epi_nosmem', + EpilogueScheduleType.NoSmemWarpSpecialized1Sm: '_epi_nosmem', + EpilogueScheduleType.NoSmemWarpSpecialized2Sm: '_epi_nosmem', + EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized1Sm: '_epi_nosmem', + EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized2Sm: '_epi_nosmem', EpilogueScheduleType.TmaWarpSpecialized: '_epi_tma', EpilogueScheduleType.TmaWarpSpecializedCooperative: '_epi_tma', EpilogueScheduleType.TmaWarpSpecialized1Sm: '', @@ -636,6 +681,23 @@ EpilogueFunctor3xTag = { EpilogueFunctor3x.LinearCombinationBlockScaleFactor: 'cutlass::epilogue::fusion::LinCombBlockScaleFactor', } +def to_grouped_schedule(schedule, grouped): + if not grouped: + return schedule + + group_schedule_map = { + KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100 : KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized1SmSm100, + KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100 : KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized2SmSm100, + KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100 : KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized1SmSm100, + KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100 : KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized2SmSm100, + KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100 : KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100, + KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100 : KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100, + EpilogueScheduleType.TmaWarpSpecialized1Sm: EpilogueScheduleType.PtrArrayTmaWarpSpecialized1Sm, + EpilogueScheduleType.TmaWarpSpecialized2Sm: 
EpilogueScheduleType.PtrArrayTmaWarpSpecialized2Sm, + } + + return group_schedule_map[schedule] + class TileSchedulerType(enum.Enum): Default = enum_auto() Persistent = enum_auto() @@ -817,7 +879,8 @@ class GemmKind(enum.Enum): PlanarComplexArray = enum_auto() Grouped = enum_auto() BlockScaledUniversal3x = enum_auto() - GroupedGemmUniversal3x = enum_auto() + GroupedUniversal3x = enum_auto() + GroupedBlockScaledUniversal3x = enum_auto() # GemmKindNames = { @@ -830,7 +893,8 @@ GemmKindNames = { GemmKind.PlanarComplexArray: "gemm_planar_complex_array", GemmKind.Grouped: "gemm_grouped", GemmKind.BlockScaledUniversal3x: "gemm_block_scaled", - GemmKind.GroupedGemmUniversal3x: "gemm_grouped", + GemmKind.GroupedUniversal3x: "gemm_grouped", + GemmKind.GroupedBlockScaledUniversal3x: "gemm_grouped_block_scaled" } # diff --git a/python/cutlass_library/sm90_utils.py b/python/cutlass_library/sm90_utils.py index 984ba33c..6e3038ec 100644 --- a/python/cutlass_library/sm90_utils.py +++ b/python/cutlass_library/sm90_utils.py @@ -489,7 +489,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types, if is_fp32 and (is_tn or is_nn) and (cta_n % cta_k != 0): return [], [] - grouped = gemm_kind == GemmKind.GroupedGemmUniversal3x + grouped = is_grouped(gemm_kind) if grouped: # the following cases are unsupported by grouped GEMM if not is_aligned: diff --git a/test/self_contained_includes/CMakeLists.txt b/test/self_contained_includes/CMakeLists.txt index 7e7b0498..cc151e1a 100644 --- a/test/self_contained_includes/CMakeLists.txt +++ b/test/self_contained_includes/CMakeLists.txt @@ -75,7 +75,6 @@ set(header_files_to_check cute/container/array_subbyte.hpp cute/container/bit_field.hpp cute/container/cuda_types.hpp - cute/container/packed_tuple.hpp cute/container/tuple.hpp cute/container/type_list.hpp @@ -107,13 +106,12 @@ set(header_files_to_check cute/arch/mma_sm70.hpp cute/arch/mma_sm75.hpp cute/arch/mma_sm80.hpp - cute/arch/mma_sm80_sparse.hpp cute/arch/mma_sm90.hpp cute/arch/mma_sm90_desc.hpp cute/arch/mma_sm90_gmma.hpp cute/arch/mma.hpp cute/arch/util.hpp - + cute/arch/cluster_sm100.hpp cute/arch/copy_sm100.hpp cute/arch/copy_sm100_tma.hpp @@ -121,7 +119,7 @@ set(header_files_to_check cute/arch/mma_sm100_desc.hpp cute/arch/mma_sm100_umma.hpp # cute/arch/tmem_allocator_sm100.hpp - + # cute/atom # cute/atom/copy_atom.hpp # cute/atom/copy_traits.hpp @@ -140,10 +138,10 @@ set(header_files_to_check cute/atom/mma_traits_sm80.hpp cute/atom/mma_traits_sm90.hpp cute/atom/mma_traits_sm90_gmma.hpp - - cute/atom/mma_traits_sm100.hpp + + cute/atom/mma_traits_sm100.hpp cute/atom/partitioner.hpp - + # cutlass cutlass/aligned_buffer.h cutlass/array.h @@ -180,7 +178,6 @@ set(header_files_to_check cutlass/numeric_size.h cutlass/numeric_types.h cutlass/pitch_linear_coord.h - cutlass/predicate.h cutlass/predicate_vector.h cutlass/quaternion.h cutlass/real.h @@ -200,16 +197,16 @@ set(header_files_to_check cutlass/workspace.h cutlass/exmy_base.h cutlass/float_subbyte.h - + # cutlass/platform cutlass/platform/platform.h # cutlass/pipeline cutlass/pipeline/pipeline.hpp cutlass/pipeline/sm90_pipeline.hpp - + cutlass/pipeline/sm100_pipeline.hpp - + # cutlass/detail cutlass/detail/cluster.hpp @@ -217,18 +214,16 @@ set(header_files_to_check cutlass/detail/dependent_false.hpp cutlass/detail/helper_macros.hpp cutlass/detail/layout.hpp - cutlass/detail/mainloop_fusion_helper_bgrada.hpp cutlass/detail/mma.hpp - + cutlass/detail/sm100_blockscaled_layout.hpp - + # cutlass/arch cutlass/arch/arch.h cutlass/arch/barrier.h 
cutlass/arch/cache_operation.h cutlass/arch/config.h - cutlass/arch/custom_abi.h cutlass/arch/grid_dependency_control.h cutlass/arch/memory.h # cutlass/arch/memory_sm75.h @@ -248,7 +243,6 @@ set(header_files_to_check # cutlass/arch/simd_sm60.h # cutlass/arch/simd_sm61.h cutlass/arch/reg_reconfig.h - cutlass/arch/tma_operation.h cutlass/arch/wmma.h # cutlass/arch/wmma_sm70.h # cutlass/arch/wmma_sm72.h diff --git a/test/unit/cute/core/CMakeLists.txt b/test/unit/cute/core/CMakeLists.txt index d74ed3a7..4469f43e 100644 --- a/test/unit/cute/core/CMakeLists.txt +++ b/test/unit/cute/core/CMakeLists.txt @@ -47,11 +47,9 @@ cutlass_test_unit_add_executable( math.cpp mixedbits.cpp nullspace.cpp - packed_tuple.cpp pointer.cpp reverse.cpp swizzle_layout.cpp transform.cpp tuple.cpp - tuple_find.cpp ) diff --git a/test/unit/cute/core/packed_tuple.cpp b/test/unit/cute/core/packed_tuple.cpp deleted file mode 100644 index 77584e88..00000000 --- a/test/unit/cute/core/packed_tuple.cpp +++ /dev/null @@ -1,581 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#include "cutlass_unit_test.h" - -#include - -#include -#include - -#include -#include -#include -#include -#include - -namespace pt_test { - -template -struct Nonempty { - T datum; - - Nonempty(T const& t) : datum{t} {} - - friend bool operator==(Nonempty const& lhs, Nonempty const& rhs) { - return lhs.datum == rhs.datum; - } - - friend bool operator!=(Nonempty const& lhs, Nonempty const& rhs) { - return !(lhs == rhs); - } -}; - -template -struct Empty { - template - friend bool operator==(Empty const&, Empty const&) { - return V == W; - } - - template - friend bool operator!=(Empty const& lhs, Empty const& rhs) { - return !(lhs == rhs); - } -}; - -// std::tuple -static_assert(cute::is_standard_layout_v>); // it happens to be -static_assert(cute::is_standard_layout_v>); // it happens to be -static_assert(cute::is_standard_layout_v>); // it happens to be -static_assert(not cute::is_standard_layout_v>); // it's not - -#if ! defined(CUTLASS_USE_PACKED_TUPLE) -// cute::tuple -static_assert(cute::is_standard_layout_v>); // it happens to be -static_assert(cute::is_standard_layout_v>); // it happens to be -static_assert(cute::is_standard_layout_v>); // it happens to be -static_assert(not cute::is_standard_layout_v>); // it's not -#endif // CUTLASS_USE_PACKED_TUPLE - -// cute::packed_tuple -static_assert(cute::is_standard_layout_v>); -static_assert(cute::is_standard_layout_v>); -static_assert(cute::is_standard_layout_v>); -static_assert(cute::is_standard_layout_v>); // it is -static_assert(cute::is_standard_layout_v>); // it is -static_assert(cute::is_standard_layout_v, int>>); // it is -static_assert(cute::is_standard_layout_v, Empty<0>>, int>>); // it is - -////////////////////////////////////////////////////////////////////// -// packed_tuple test starts here -////////////////////////////////////////////////////////////////////// - -template < - class ExpectedPackedType, - size_t ExpectedPackedSize, - class ... Args> -constexpr void -test_packed_type_alias([[maybe_unused]] ExpectedPackedType packed, std::tuple unpacked) -{ - using cute::packed_tuple; - - if constexpr ((cute::is_standard_layout_v && ...)) { - static_assert(cute::is_standard_layout_v>); - } - - if constexpr ((cute::is_empty_v && ...)) { - static_assert(cute::is_empty_v>); - } - - static_assert(cute::tuple_size_v> == sizeof...(Args)); - - auto test_element = [unpacked] (auto index) { - static_assert(cute::is_same_v< - std::tuple_element_t>, - std::tuple_element_t> - >); - - packed_tuple sl = cute::apply(unpacked, [](auto... 
a){ return cute::make_packed_tuple(a...); }); - EXPECT_EQ(std::get(unpacked), cute::get(sl)); - }; - cute::for_each(std::make_index_sequence(), test_element); -} - -void test_packed_type_aliases() { - using cute::packed_tuple; - test_packed_type_alias, 0>({}, {}); - - test_packed_type_alias, 1, int>({7}, {7}); - test_packed_type_alias, 1, double>({1.5}, {1.5}); - - // Make sure that class types are handled the same as scalar types - test_packed_type_alias>, 1, Nonempty>( - {Nonempty{7}}, {Nonempty{7}}); - test_packed_type_alias>, 1, Nonempty>( - {Nonempty{1.5}}, {Nonempty{1.5}}); - - test_packed_type_alias, 0, Empty<0>>({}, {}); - test_packed_type_alias, 0, Empty<0>, Empty<1>>( - {}, {Empty<0>{}, Empty<1>{}}); - test_packed_type_alias, 0, Empty<0>, Empty<1>, Empty<2>>( - {}, {Empty<0>{}, Empty<1>{}, Empty<2>{}}); - - test_packed_type_alias, 1, Empty<0>, int>( - {7}, {Empty<0>{}, 7}); - test_packed_type_alias, 1, int, Empty<0>>( - {7}, {7, Empty<0>{}}); - - test_packed_type_alias, 1, int, Empty<0>, Empty<1>>( - {7}, {7, Empty<0>{}, Empty<1>{}}); - test_packed_type_alias, 1, Empty<0>, int, Empty<1>>( - {7}, {Empty<0>{}, 7, Empty<1>{}}); - test_packed_type_alias, 1, Empty<0>, Empty<1>, int>( - {7}, {Empty<0>{}, Empty<1>{}, 7}); - - test_packed_type_alias, 2, int, double, Empty<0>>( - {7, 1.5}, {7, 1.5, Empty<0>{}}); - test_packed_type_alias, 2, int, Empty<0>, double>( - {7, 1.5}, {7, Empty<0>{}, 1.5}); - test_packed_type_alias, 2, int, double, Empty<0>>( - {7, 1.5}, {7, 1.5, Empty<0>{}}); - - test_packed_type_alias, 2, int, double, Empty<0>, Empty<1>>( - {7, 1.5}, {7, 1.5, Empty<0>{}, Empty<1>{}}); - test_packed_type_alias, 2, int, Empty<0>, double, Empty<1>>( - {7, 1.5}, {7, Empty<0>{}, 1.5, Empty<1>{}}); - test_packed_type_alias, 2, int, Empty<0>, Empty<1>, double>( - {7, 1.5}, {7, Empty<0>{}, Empty<1>{}, 1.5}); - test_packed_type_alias, 2, Empty<0>, int, Empty<1>, double>( - {7, 1.5}, {Empty<0>{}, 7, Empty<1>{}, 1.5}); - test_packed_type_alias, 2, Empty<0>, Empty<1>, int, double>( - {7, 1.5}, {Empty<0>{}, Empty<1>{}, 7, 1.5}); - - test_packed_type_alias, 3, Empty<0>, int, double, float>( - {7, 1.5, 2.5f}, {Empty<0>{}, 7, 1.5, 2.5f}); - test_packed_type_alias, 3, int, Empty<0>, double, float>( - {7, 1.5, 2.5f}, {7, Empty<0>{}, 1.5, 2.5f}); - test_packed_type_alias, 3, int, double, Empty<0>, float>( - {7, 1.5, 2.5f}, {7, 1.5, Empty<0>{}, 2.5f}); - test_packed_type_alias, 3, int, double, float, Empty<0>>( - {7, 1.5, 2.5f}, {7, 1.5, 2.5f, Empty<0>{}}); -} - -template -constexpr bool test_tuple_element() { - return cute::is_same_v, ExpectedElementType>; -} - -void test_tuple_elements() { - using cute::packed_tuple; - - static_assert(test_tuple_element>, 0, Empty<0>>()); - static_assert(test_tuple_element>, 0, Empty<0>>()); -} - -// A default-constructible type. -template -struct DefaultConstructible {}; - -void test_default_constructibility() { - using cute::packed_tuple; - { - [[maybe_unused]] packed_tuple<> t_p_0; - [[maybe_unused]] packed_tuple> t_p_1; - [[maybe_unused]] packed_tuple, DefaultConstructible<1>> t_p_2; - [[maybe_unused]] packed_tuple, int, DefaultConstructible<1>> t_p_3; - } -} - -void test_sizes_and_not_storing_empty_types() { - using cute::packed_tuple; - - [[maybe_unused]] packed_tuple< - int, - pt_test::Empty<0>, - double - > pt{42, pt_test::Empty<0>{}, 1.5}; - static_assert(cute::is_standard_layout_v); - // packed_result_type must only store the packed tuple, - // and not the integer_sequence(s) used to access it. 
- // The latter can be represented entirely at compile time as types. - struct { int i; double j; } IntDouble; - static_assert(sizeof(pt) == sizeof(IntDouble)); - - EXPECT_EQ(cute::get<0>(pt), 42); - EXPECT_EQ(cute::get<1>(pt), pt_test::Empty<0>{}); - EXPECT_EQ(cute::get<2>(pt), 1.5); - packed_tuple< - pt_test::Empty<0>, - pt_test::Empty<1>, - packed_tuple< - pt_test::Empty<0>, - pt_test::Empty<1>, - packed_tuple, packed_tuple<>> - > - > pt_empty{}; - static_assert(cute::is_empty_v); - static_assert(cute::is_standard_layout_v); - static_assert(sizeof(pt_empty) == 1); - - // Template arguments must be default constructible, - // and packed_tuple itself needs a default constructor. - [[maybe_unused]] packed_tuple< - packed_tuple>, - double, - pt_test::Empty<3>> pt2; - static_assert(cute::is_standard_layout_v); - - // cute::packed_tuple, like the original cute::tuple, does not - // promise to have working CTAD (constructor template argument - // deduction). - [[maybe_unused]] packed_tuple< - packed_tuple>, - pt_test::Empty<1> - > pt3{ - packed_tuple>{42, pt_test::Empty<0>{}}, - pt_test::Empty<1>{} - }; - static_assert(cute::is_standard_layout_v); - static_assert(cute::is_same_v< - cute::tuple_element_t<0, decltype(pt3)>, - packed_tuple>>); - static_assert(cute::is_same_v< - cute::tuple_element_t<1, decltype(pt3)>, - pt_test::Empty<1>>); - static_assert(cute::tuple_size_v> == 2u); - - packed_tuple> pt3_0 = cute::get<0>(pt3); - auto pt3_0_1 = cute::get<1>(pt3_0); - static_assert(cute::is_same_v>); - - EXPECT_EQ(cute::get<0>(cute::get<0>(pt3)), 42); - EXPECT_EQ(cute::get<1>(cute::get<0>(pt3)), pt_test::Empty<0>{}); -} - -} // namespace test - -TEST(CuTe_core, PackedTuple2) -{ - CUTLASS_TRACE_HOST("-------------------------------"); - CUTLASS_TRACE_HOST("packed_tuple"); - CUTLASS_TRACE_HOST("-------------------------------"); - - pt_test::test_packed_type_aliases(); - pt_test::test_tuple_elements(); - pt_test::test_default_constructibility(); - pt_test::test_sizes_and_not_storing_empty_types(); -} - -TEST(CuTe_core, PackedTuple2Get) { - using cute::packed_tuple; - using pt_test::Empty; - using pt_test::Nonempty; - - { - using tuple_type = packed_tuple; - tuple_type pt{42}; - static_assert(cute::tuple_size_v == 1u); - static_assert(cute::is_same_v, int>); - EXPECT_EQ(cute::get<0>(pt), 42); - cute::get<0>(pt) = 43; - EXPECT_EQ(cute::get<0>(pt), 43); - } - { - using tuple_type = packed_tuple; - tuple_type const pt{42}; - EXPECT_EQ(cute::get<0>(pt), 42); - static_assert(cute::is_same_v(pt)), int const&>); - } - { - EXPECT_EQ(cute::get<0>(packed_tuple{42}), 42); - } - - { - using tuple_type = packed_tuple>; - tuple_type pt; - static_assert(cute::tuple_size_v == 1u); - static_assert(cute::is_same_v, pt_test::Empty<0>>); - EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{}); - } - { - using tuple_type = packed_tuple>; - tuple_type const pt; - EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{}); - } - { - using tuple_type = packed_tuple>; - EXPECT_EQ(cute::get<0>(tuple_type{}), pt_test::Empty<0>{}); - } - - { - using tuple_type = packed_tuple; - tuple_type pt{1, 2.5}; - static_assert(cute::tuple_size_v == 2u); - static_assert(cute::is_same_v, int>); - static_assert(cute::is_same_v, double>); - EXPECT_EQ(cute::get<0>(pt), 1); - cute::get<0>(pt) = 2; - EXPECT_EQ(cute::get<0>(pt), 2); - EXPECT_EQ(cute::get<1>(pt), 2.5); - cute::get<1>(pt) = 3.5; - EXPECT_EQ(cute::get<1>(pt), 3.5); - } - { - using tuple_type = packed_tuple; - tuple_type const pt{1, 2.5}; - EXPECT_EQ(cute::get<0>(pt), 1); - 
static_assert(cute::is_same_v(pt)), int const&>); - EXPECT_EQ(cute::get<1>(pt), 2.5); - static_assert(cute::is_same_v(pt)), double const&>); - } - { - using tuple_type = packed_tuple; - EXPECT_EQ(cute::get<0>(tuple_type{1, 2.5}), 1); - EXPECT_EQ(cute::get<1>(tuple_type{1, 2.5}), 2.5); - } - - { - using tuple_type = packed_tuple, double>; - tuple_type pt{Empty<0>{}, 2.5}; - static_assert(cute::tuple_size_v == 2u); - static_assert(cute::is_same_v, Empty<0>>); - static_assert(cute::is_same_v, double>); - EXPECT_EQ(cute::get<0>(pt), Empty<0>{}); - EXPECT_EQ(cute::get<1>(pt), 2.5); - cute::get<1>(pt) = 3.5; - EXPECT_EQ(cute::get<1>(pt), 3.5); - } - { - using tuple_type = packed_tuple, double>; - tuple_type const pt{Empty<0>{}, 2.5}; - EXPECT_EQ(cute::get<0>(pt), Empty<0>{}); - static_assert(cute::is_same_v(pt)), Empty<0>>); - EXPECT_EQ(cute::get<1>(pt), 2.5); - static_assert(cute::is_same_v(pt)), double const&>); - } - { - using tuple_type = packed_tuple, double>; - EXPECT_EQ(cute::get<0>(tuple_type{Empty<0>{}, 2.5}), Empty<0>{}); - EXPECT_EQ(cute::get<1>(tuple_type{Empty<0>{}, 2.5}), 2.5); - } - - { - using tuple_type = packed_tuple>; - tuple_type pt{1, 2.5, Nonempty{3.25f}}; - static_assert(cute::tuple_size_v == 3u); - static_assert(cute::is_same_v, int>); - static_assert(cute::is_same_v, double>); - static_assert(cute::is_same_v, Nonempty>); - EXPECT_EQ(cute::get<0>(pt), 1); - EXPECT_EQ(cute::get<1>(pt), 2.5); - EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f}); - - cute::get<0>(pt) = 42; - EXPECT_EQ(cute::get<0>(pt), 42); - cute::get<1>(pt) = 4.5; - EXPECT_EQ(cute::get<1>(pt), 4.5); - cute::get<2>(pt) = Nonempty{3.75f}; - EXPECT_EQ(cute::get<2>(pt), Nonempty{3.75f}); - } - { - using tuple_type = packed_tuple>; - tuple_type const pt{1, 2.5, Nonempty{3.25f}}; - EXPECT_EQ(cute::get<0>(pt), 1); - EXPECT_EQ(cute::get<1>(pt), 2.5); - EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f}); - } - { - using tuple_type = packed_tuple>; - EXPECT_EQ((cute::get<0>(tuple_type{1, 2.5, Nonempty{3.25f}})), 1); - EXPECT_EQ((cute::get<1>(tuple_type{1, 2.5, Nonempty{3.25f}})), 2.5); - EXPECT_EQ((cute::get<2>(tuple_type{1, 2.5, Nonempty{3.25f}})), Nonempty{3.25f}); - } - - { - using tuple_type = packed_tuple, Nonempty>; - packed_tuple, Nonempty> pt{1, Empty<0>{}, Nonempty{3.25f}}; - static_assert(cute::tuple_size_v == 3u); - static_assert(cute::is_same_v, int>); - static_assert(cute::is_same_v, Empty<0>>); - static_assert(cute::is_same_v, Nonempty>); - EXPECT_EQ(cute::get<0>(pt), 1); - EXPECT_EQ(cute::get<1>(pt), Empty<0>{}); - EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f}); - - cute::get<0>(pt) = 42; - EXPECT_EQ(cute::get<0>(pt), 42); - cute::get<2>(pt) = Nonempty{3.75f}; - EXPECT_EQ(cute::get<2>(pt), Nonempty{3.75f}); - } - { - using tuple_type = packed_tuple, Nonempty>; - tuple_type const pt{1, Empty<0>{}, Nonempty{3.25f}}; - EXPECT_EQ(cute::get<0>(pt), 1); - EXPECT_EQ(cute::get<1>(pt), Empty<0>{}); - EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f}); - } - { - using tuple_type = packed_tuple, Nonempty>; - EXPECT_EQ((cute::get<0>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), 1); - EXPECT_EQ((cute::get<1>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Empty<0>{}); - EXPECT_EQ((cute::get<2>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Nonempty{3.25f}); - } -} - -namespace pt_test { - -// An empty class type to which Empty is convertible. 
-template -struct ConvertibleFromEmpty { - constexpr ConvertibleFromEmpty() = default; - constexpr ConvertibleFromEmpty(Empty) {} - - template - friend constexpr bool operator==(ConvertibleFromEmpty const&, ConvertibleFromEmpty const&) { - return Value == OtherValue; - } - - template - friend constexpr bool operator!=(ConvertibleFromEmpty const& lhs, ConvertibleFromEmpty const& rhs) { - return !(lhs == rhs); - } -}; - -} // end namespace pt_test - -TEST(CuTe_core, PackedTupleConstexprDefaultConstruction) { - // Make sure that packed_tuple's default constructor is constexpr. - // MSVC makes this a bit more challenging than usual. - - using pt_test::Empty; - { - [[maybe_unused]] constexpr cute::detail::ESO_t> eso1{}; - [[maybe_unused]] constexpr cute::detail::ESO_t eso2{}; - } - { - [[maybe_unused]] constexpr cute::detail::ESO_t, Empty<1>> eso0{}; - [[maybe_unused]] constexpr cute::detail::ESO_t> eso1{}; - [[maybe_unused]] constexpr cute::detail::ESO_t, int64_t> eso2{}; - [[maybe_unused]] constexpr cute::detail::ESO_t eso3{}; - } -} - -TEST(CuTe_core, PackedTupleConvertingConstruction) { - using cute::packed_tuple; - using pt_test::ConvertibleFromEmpty; - using pt_test::Empty; - using pt_test::Nonempty; - - { - using tuple_type = cute::tuple>; - [[maybe_unused]] tuple_type t(7); - EXPECT_EQ(cute::get<0>(t), Nonempty(7)); - } - { - using tuple_type = packed_tuple>; - [[maybe_unused]] tuple_type t(7); - EXPECT_EQ(cute::get<0>(t), Nonempty(7)); - } - { - using tuple_type = cute::tuple>; - [[maybe_unused]] tuple_type t(Empty<0>{}); - EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); - } - { - using tuple_type = packed_tuple>; - [[maybe_unused]] tuple_type t(Empty<0>{}); - EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); - } - - { - using tuple_type = cute::tuple>; - [[maybe_unused]] tuple_type t(1.5f, 7); - EXPECT_EQ(cute::get<0>(t), 1.5f); - EXPECT_EQ(cute::get<1>(t), Nonempty(7)); - } - { - using tuple_type = packed_tuple>; - [[maybe_unused]] tuple_type t(1.5f, 7); - EXPECT_EQ(cute::get<0>(t), 1.5f); - EXPECT_EQ(cute::get<1>(t), Nonempty(7)); - } - - { - using tuple_type = cute::tuple, Nonempty>; - [[maybe_unused]] tuple_type t(Empty<0>{}, 7); - EXPECT_EQ(cute::get<0>(t), Empty<0>{}); - EXPECT_EQ(cute::get<1>(t), Nonempty(7)); - } - { - using tuple_type = packed_tuple, Nonempty>; - [[maybe_unused]] tuple_type t(Empty<0>{}, 7); - EXPECT_EQ(cute::get<0>(t), Empty<0>{}); - EXPECT_EQ(cute::get<1>(t), Nonempty(7)); - } - - { - using tuple_type = cute::tuple, Nonempty>; - [[maybe_unused]] tuple_type t(Empty<0>{}, 7); - EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); - EXPECT_EQ(cute::get<1>(t), Nonempty(7)); - } - { - using tuple_type = packed_tuple, Nonempty>; - [[maybe_unused]] tuple_type t(Empty<0>{}, 7); - EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); - EXPECT_EQ(cute::get<1>(t), Nonempty(7)); - } - - { - using inner_tuple_type = cute::tuple>; - using outer_tuple_type = cute::tuple; - [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); - } - { - using inner_tuple_type = packed_tuple>; - using outer_tuple_type = packed_tuple; - [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); - } - { - using inner_tuple_type = cute::tuple>; - using outer_tuple_type = cute::tuple; - [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); - } - { - using inner_tuple_type = packed_tuple>; - using outer_tuple_type = packed_tuple; - [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); - } - -} - - diff --git 
a/test/unit/cute/core/tuple.cpp b/test/unit/cute/core/tuple.cpp index f1efb36e..ea31edd9 100644 --- a/test/unit/cute/core/tuple.cpp +++ b/test/unit/cute/core/tuple.cpp @@ -32,6 +32,13 @@ #include "cutlass_unit_test.h" #include + +#include +#include + +#include +#include +#include #include TEST(CuTe_core, Tuple) @@ -120,6 +127,11 @@ TEST(CuTe_core, Tuple) ASSERT_TRUE(sizeof(tuple_3h_m_type) == 12); ASSERT_TRUE(!std::is_empty::value); + ASSERT_TRUE(sizeof(cute::tuple<_1, _1, cute::tuple>) == 4); + ASSERT_TRUE(sizeof(cute::tuple<_1, _0, cute::tuple>) == 4); + ASSERT_TRUE(sizeof(cute::tuple<_1, cute::tuple<_1, int32_t>>) == 4); + ASSERT_TRUE(sizeof(cute::tuple<_1, cute::tuple<_0, int32_t>>) == 4); + CUTLASS_TRACE_HOST("-------------------------------"); CUTLASS_TRACE_HOST("SIMPLE TUPLE OPS"); CUTLASS_TRACE_HOST("-------------------------------"); @@ -264,3 +276,588 @@ TEST(CuTe_core, Tuple) CUTLASS_TRACE_HOST("a(_,1,_,(1,2)) = " << dice(make_coord(_,1,_,make_coord(1,2)), a)); } } + +namespace pt_test { + +template +struct Nonempty { + T datum; + + Nonempty(T const& t) : datum{t} {} + + friend bool operator==(Nonempty const& lhs, Nonempty const& rhs) { + return lhs.datum == rhs.datum; + } + + friend bool operator!=(Nonempty const& lhs, Nonempty const& rhs) { + return !(lhs == rhs); + } +}; + +template +struct Empty { + template + friend bool operator==(Empty const&, Empty const&) { + return V == W; + } + + template + friend bool operator!=(Empty const& lhs, Empty const& rhs) { + return !(lhs == rhs); + } +}; + +// std::tuple +static_assert(cute::is_standard_layout_v>); // it happens to be +static_assert(cute::is_standard_layout_v>); // it happens to be +static_assert(cute::is_standard_layout_v>); // it happens to be +static_assert(not cute::is_standard_layout_v>); // it's not + +// cute::tuple +static_assert(cute::is_standard_layout_v>); +static_assert(cute::is_standard_layout_v>); +static_assert(cute::is_standard_layout_v>); +static_assert(cute::is_standard_layout_v>); // it is +static_assert(cute::is_standard_layout_v>); // it is +static_assert(cute::is_standard_layout_v, int>>); // it is +static_assert(cute::is_standard_layout_v, Empty<0>>, int>>); // it is + +////////////////////////////////////////////////////////////////////// +// tuple test starts here +////////////////////////////////////////////////////////////////////// + +template < + class ExpectedPackedType, + size_t ExpectedPackedSize, + class ... Args> +constexpr void +test_packed_type_alias([[maybe_unused]] ExpectedPackedType packed, std::tuple unpacked) +{ + using cute::tuple; + + if constexpr ((cute::is_standard_layout_v && ...)) { + static_assert(cute::is_standard_layout_v>); + } + + if constexpr ((cute::is_empty_v && ...)) { + static_assert(cute::is_empty_v>); + } + + static_assert(cute::tuple_size_v> == sizeof...(Args)); + + auto test_element = [unpacked] (auto index) { + static_assert(cute::is_same_v< + std::tuple_element_t>, + std::tuple_element_t> + >); + + tuple sl = cute::apply(unpacked, [](auto... 
a){ return cute::make_tuple(a...); }); + EXPECT_EQ(std::get(unpacked), cute::get(sl)); + }; + cute::for_each(std::make_index_sequence(), test_element); +} + +void test_packed_type_aliases() { + using cute::tuple; + test_packed_type_alias, 0>({}, {}); + + test_packed_type_alias, 1, int>({7}, {7}); + test_packed_type_alias, 1, double>({1.5}, {1.5}); + + // Make sure that class types are handled the same as scalar types + test_packed_type_alias>, 1, Nonempty>( + {Nonempty{7}}, {Nonempty{7}}); + test_packed_type_alias>, 1, Nonempty>( + {Nonempty{1.5}}, {Nonempty{1.5}}); + + test_packed_type_alias, 0, Empty<0>>({}, {}); + test_packed_type_alias, 0, Empty<0>, Empty<1>>( + {}, {Empty<0>{}, Empty<1>{}}); + test_packed_type_alias, 0, Empty<0>, Empty<1>, Empty<2>>( + {}, {Empty<0>{}, Empty<1>{}, Empty<2>{}}); + + test_packed_type_alias, 1, Empty<0>, int>( + {7}, {Empty<0>{}, 7}); + test_packed_type_alias, 1, int, Empty<0>>( + {7}, {7, Empty<0>{}}); + + test_packed_type_alias, 1, int, Empty<0>, Empty<1>>( + {7}, {7, Empty<0>{}, Empty<1>{}}); + test_packed_type_alias, 1, Empty<0>, int, Empty<1>>( + {7}, {Empty<0>{}, 7, Empty<1>{}}); + test_packed_type_alias, 1, Empty<0>, Empty<1>, int>( + {7}, {Empty<0>{}, Empty<1>{}, 7}); + + test_packed_type_alias, 2, int, double, Empty<0>>( + {7, 1.5}, {7, 1.5, Empty<0>{}}); + test_packed_type_alias, 2, int, Empty<0>, double>( + {7, 1.5}, {7, Empty<0>{}, 1.5}); + test_packed_type_alias, 2, int, double, Empty<0>>( + {7, 1.5}, {7, 1.5, Empty<0>{}}); + + test_packed_type_alias, 2, int, double, Empty<0>, Empty<1>>( + {7, 1.5}, {7, 1.5, Empty<0>{}, Empty<1>{}}); + test_packed_type_alias, 2, int, Empty<0>, double, Empty<1>>( + {7, 1.5}, {7, Empty<0>{}, 1.5, Empty<1>{}}); + test_packed_type_alias, 2, int, Empty<0>, Empty<1>, double>( + {7, 1.5}, {7, Empty<0>{}, Empty<1>{}, 1.5}); + test_packed_type_alias, 2, Empty<0>, int, Empty<1>, double>( + {7, 1.5}, {Empty<0>{}, 7, Empty<1>{}, 1.5}); + test_packed_type_alias, 2, Empty<0>, Empty<1>, int, double>( + {7, 1.5}, {Empty<0>{}, Empty<1>{}, 7, 1.5}); + + test_packed_type_alias, 3, Empty<0>, int, double, float>( + {7, 1.5, 2.5f}, {Empty<0>{}, 7, 1.5, 2.5f}); + test_packed_type_alias, 3, int, Empty<0>, double, float>( + {7, 1.5, 2.5f}, {7, Empty<0>{}, 1.5, 2.5f}); + test_packed_type_alias, 3, int, double, Empty<0>, float>( + {7, 1.5, 2.5f}, {7, 1.5, Empty<0>{}, 2.5f}); + test_packed_type_alias, 3, int, double, float, Empty<0>>( + {7, 1.5, 2.5f}, {7, 1.5, 2.5f, Empty<0>{}}); +} + +template +constexpr bool test_tuple_element() { + return cute::is_same_v, ExpectedElementType>; +} + +void test_tuple_elements() { + using cute::tuple; + + static_assert(test_tuple_element>, 0, Empty<0>>()); + static_assert(test_tuple_element>, 0, Empty<0>>()); +} + +// A default-constructible type. +template +struct DefaultConstructible {}; + +void test_default_constructibility() { + using cute::tuple; + { + [[maybe_unused]] tuple<> t_p_0; + [[maybe_unused]] tuple> t_p_1; + [[maybe_unused]] tuple, DefaultConstructible<1>> t_p_2; + [[maybe_unused]] tuple, int, DefaultConstructible<1>> t_p_3; + } +} + +void test_sizes_and_not_storing_empty_types() { + using cute::tuple; + + [[maybe_unused]] tuple< + int, + pt_test::Empty<0>, + double + > pt{42, pt_test::Empty<0>{}, 1.5}; + static_assert(cute::is_standard_layout_v); + // packed_result_type must only store the packed tuple, + // and not the integer_sequence(s) used to access it. + // The latter can be represented entirely at compile time as types. 
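+  // A minimal sketch for intuition, assuming hypothetical types B and D that are
+  // not defined in this test: the size comparison below relies on empty members
+  // occupying no storage, in the same spirit as the empty-base optimization,
+  //   struct B {};                        // empty
+  //   struct D : B { int i; double j; };  // expected: sizeof(D) == sizeof(IntDouble) below
+  // so a tuple holding an int, an empty type, and a double is expected to pack
+  // down to just the int plus the double.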
+  struct { int i; double j; } IntDouble;
+  static_assert(sizeof(pt) == sizeof(IntDouble));
+
+  EXPECT_EQ(cute::get<0>(pt), 42);
+  EXPECT_EQ(cute::get<1>(pt), pt_test::Empty<0>{});
+  EXPECT_EQ(cute::get<2>(pt), 1.5);
+  tuple<
+    pt_test::Empty<0>,
+    pt_test::Empty<1>,
+    tuple<
+      pt_test::Empty<0>,
+      pt_test::Empty<1>,
+      tuple<pt_test::Empty<0>, tuple<>>
+    >
+  > pt_empty{};
+  static_assert(cute::is_empty_v<decltype(pt_empty)>);
+  static_assert(cute::is_standard_layout_v<decltype(pt_empty)>);
+  static_assert(sizeof(pt_empty) == 1);
+
+  // Template arguments must be default constructible,
+  // and tuple itself needs a default constructor.
+  [[maybe_unused]] tuple<
+    tuple<int, pt_test::Empty<2>>,
+    double,
+    pt_test::Empty<3>> pt2;
+  static_assert(cute::is_standard_layout_v<decltype(pt2)>);
+
+  // cute::tuple, like the original cute::tuple, does not
+  // promise to have working CTAD (constructor template argument
+  // deduction).
+  [[maybe_unused]] tuple<
+    tuple<int, pt_test::Empty<0>>,
+    pt_test::Empty<1>
+  > pt3{
+    tuple<int, pt_test::Empty<0>>{42, pt_test::Empty<0>{}},
+    pt_test::Empty<1>{}
+  };
+  static_assert(cute::is_standard_layout_v<decltype(pt3)>);
+  static_assert(cute::is_same_v<
+    cute::tuple_element_t<0, decltype(pt3)>,
+    tuple<int, pt_test::Empty<0>>>);
+  static_assert(cute::is_same_v<
+    cute::tuple_element_t<1, decltype(pt3)>,
+    pt_test::Empty<1>>);
+  static_assert(cute::tuple_size_v<cute::tuple_element_t<0, decltype(pt3)>> == 2u);
+
+  tuple<int, pt_test::Empty<0>> pt3_0 = cute::get<0>(pt3);
+  auto pt3_0_1 = cute::get<1>(pt3_0);
+  static_assert(cute::is_same_v<decltype(pt3_0_1), pt_test::Empty<0>>);
+
+  EXPECT_EQ(cute::get<0>(cute::get<0>(pt3)), 42);
+  EXPECT_EQ(cute::get<1>(cute::get<0>(pt3)), pt_test::Empty<0>{});
+}
+
+} // namespace test
+
+TEST(CuTe_core, PackedTuple2)
+{
+  CUTLASS_TRACE_HOST("-------------------------------");
+  CUTLASS_TRACE_HOST("tuple");
+  CUTLASS_TRACE_HOST("-------------------------------");
+
+  pt_test::test_packed_type_aliases();
+  pt_test::test_tuple_elements();
+  pt_test::test_default_constructibility();
+  pt_test::test_sizes_and_not_storing_empty_types();
+}
+
+TEST(CuTe_core, PackedTuple2Get) {
+  using cute::tuple;
+  using pt_test::Empty;
+  using pt_test::Nonempty;
+
+  {
+    using tuple_type = tuple<int>;
+    tuple_type pt{42};
+    static_assert(cute::tuple_size_v<tuple_type> == 1u);
+    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
+    EXPECT_EQ(cute::get<0>(pt), 42);
+    cute::get<0>(pt) = 43;
+    EXPECT_EQ(cute::get<0>(pt), 43);
+  }
+  {
+    using tuple_type = tuple<int>;
+    tuple_type const pt{42};
+    EXPECT_EQ(cute::get<0>(pt), 42);
+    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
+  }
+  {
+    EXPECT_EQ(cute::get<0>(tuple<int>{42}), 42);
+  }
+
+  {
+    using tuple_type = tuple<pt_test::Empty<0>>;
+    tuple_type pt;
+    static_assert(cute::tuple_size_v<tuple_type> == 1u);
+    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, pt_test::Empty<0>>);
+    EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
+  }
+  {
+    using tuple_type = tuple<pt_test::Empty<0>>;
+    tuple_type const pt;
+    EXPECT_EQ(cute::get<0>(pt), pt_test::Empty<0>{});
+  }
+  {
+    using tuple_type = tuple<pt_test::Empty<0>>;
+    EXPECT_EQ(cute::get<0>(tuple_type{}), pt_test::Empty<0>{});
+  }
+
+  {
+    using tuple_type = tuple<int, double>;
+    tuple_type pt{1, 2.5};
+    static_assert(cute::tuple_size_v<tuple_type> == 2u);
+    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
+    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
+    EXPECT_EQ(cute::get<0>(pt), 1);
+    cute::get<0>(pt) = 2;
+    EXPECT_EQ(cute::get<0>(pt), 2);
+    EXPECT_EQ(cute::get<1>(pt), 2.5);
+    cute::get<1>(pt) = 3.5;
+    EXPECT_EQ(cute::get<1>(pt), 3.5);
+  }
+  {
+    using tuple_type = tuple<int, double>;
+    tuple_type const pt{1, 2.5};
+    EXPECT_EQ(cute::get<0>(pt), 1);
+    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), int const&>);
+    EXPECT_EQ(cute::get<1>(pt), 2.5);
+    static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
+  }
+  {
+    using tuple_type = tuple<int, double>;
+    EXPECT_EQ(cute::get<0>(tuple_type{1, 2.5}), 1);
+    EXPECT_EQ(cute::get<1>(tuple_type{1, 2.5}), 2.5);
+  }
+
+  {
+    using tuple_type = tuple<Empty<0>, double>;
+    tuple_type pt{Empty<0>{}, 2.5};
+    static_assert(cute::tuple_size_v<tuple_type> == 2u);
+    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, Empty<0>>);
+    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
+    EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
+    EXPECT_EQ(cute::get<1>(pt), 2.5);
+    cute::get<1>(pt) = 3.5;
+    EXPECT_EQ(cute::get<1>(pt), 3.5);
+  }
+  {
+    using tuple_type = tuple<Empty<0>, double>;
+    tuple_type const pt{Empty<0>{}, 2.5};
+    EXPECT_EQ(cute::get<0>(pt), Empty<0>{});
+    static_assert(cute::is_same_v<decltype(cute::get<0>(pt)), Empty<0>>);
+    EXPECT_EQ(cute::get<1>(pt), 2.5);
+    static_assert(cute::is_same_v<decltype(cute::get<1>(pt)), double const&>);
+  }
+  {
+    using tuple_type = tuple<Empty<0>, double>;
+    EXPECT_EQ(cute::get<0>(tuple_type{Empty<0>{}, 2.5}), Empty<0>{});
+    EXPECT_EQ(cute::get<1>(tuple_type{Empty<0>{}, 2.5}), 2.5);
+  }
+
+  {
+    using tuple_type = tuple<int, double, Nonempty<float>>;
+    tuple_type pt{1, 2.5, Nonempty{3.25f}};
+    static_assert(cute::tuple_size_v<tuple_type> == 3u);
+    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
+    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, double>);
+    static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
+    EXPECT_EQ(cute::get<0>(pt), 1);
+    EXPECT_EQ(cute::get<1>(pt), 2.5);
+    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
+
+    cute::get<0>(pt) = 42;
+    EXPECT_EQ(cute::get<0>(pt), 42);
+    cute::get<1>(pt) = 4.5;
+    EXPECT_EQ(cute::get<1>(pt), 4.5);
+    cute::get<2>(pt) = Nonempty{3.75f};
+    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.75f});
+  }
+  {
+    using tuple_type = tuple<int, double, Nonempty<float>>;
+    tuple_type const pt{1, 2.5, Nonempty{3.25f}};
+    EXPECT_EQ(cute::get<0>(pt), 1);
+    EXPECT_EQ(cute::get<1>(pt), 2.5);
+    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
+  }
+  {
+    using tuple_type = tuple<int, double, Nonempty<float>>;
+    EXPECT_EQ((cute::get<0>(tuple_type{1, 2.5, Nonempty{3.25f}})), 1);
+    EXPECT_EQ((cute::get<1>(tuple_type{1, 2.5, Nonempty{3.25f}})), 2.5);
+    EXPECT_EQ((cute::get<2>(tuple_type{1, 2.5, Nonempty{3.25f}})), Nonempty{3.25f});
+  }
+
+  {
+    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
+    tuple<int, Empty<0>, Nonempty<float>> pt{1, Empty<0>{}, Nonempty{3.25f}};
+    static_assert(cute::tuple_size_v<tuple_type> == 3u);
+    static_assert(cute::is_same_v<cute::tuple_element_t<0, tuple_type>, int>);
+    static_assert(cute::is_same_v<cute::tuple_element_t<1, tuple_type>, Empty<0>>);
+    static_assert(cute::is_same_v<cute::tuple_element_t<2, tuple_type>, Nonempty<float>>);
+    EXPECT_EQ(cute::get<0>(pt), 1);
+    EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
+    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
+
+    cute::get<0>(pt) = 42;
+    EXPECT_EQ(cute::get<0>(pt), 42);
+    cute::get<2>(pt) = Nonempty{3.75f};
+    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.75f});
+  }
+  {
+    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
+    tuple_type const pt{1, Empty<0>{}, Nonempty{3.25f}};
+    EXPECT_EQ(cute::get<0>(pt), 1);
+    EXPECT_EQ(cute::get<1>(pt), Empty<0>{});
+    EXPECT_EQ(cute::get<2>(pt), Nonempty{3.25f});
+  }
+  {
+    using tuple_type = tuple<int, Empty<0>, Nonempty<float>>;
+    EXPECT_EQ((cute::get<0>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), 1);
+    EXPECT_EQ((cute::get<1>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Empty<0>{});
+    EXPECT_EQ((cute::get<2>(tuple_type{1, Empty<0>{}, Nonempty{3.25f}})), Nonempty{3.25f});
+  }
+}
+
+namespace pt_test {
+
+// An empty class type to which Empty is convertible.
+template +struct ConvertibleFromEmpty { + constexpr ConvertibleFromEmpty() = default; + constexpr ConvertibleFromEmpty(Empty) {} + + template + friend constexpr bool operator==(ConvertibleFromEmpty const&, ConvertibleFromEmpty const&) { + return Value == OtherValue; + } + + template + friend constexpr bool operator!=(ConvertibleFromEmpty const& lhs, ConvertibleFromEmpty const& rhs) { + return !(lhs == rhs); + } +}; + +} // end namespace pt_test + +TEST(CuTe_core, PackedTupleConstexprDefaultConstruction) { + // Make sure that tuple's default constructor is constexpr. + // MSVC makes this a bit more challenging than usual. + + using pt_test::Empty; + { + [[maybe_unused]] constexpr cute::detail::ESO_t> eso1{}; + [[maybe_unused]] constexpr cute::detail::ESO_t eso2{}; + } + { + [[maybe_unused]] constexpr cute::detail::ESO_t, Empty<1>> eso0{}; + [[maybe_unused]] constexpr cute::detail::ESO_t> eso1{}; + [[maybe_unused]] constexpr cute::detail::ESO_t, int64_t> eso2{}; + [[maybe_unused]] constexpr cute::detail::ESO_t eso3{}; + } +} + +TEST(CuTe_core, PackedTupleConvertingConstruction) { + using cute::tuple; + using pt_test::ConvertibleFromEmpty; + using pt_test::Empty; + using pt_test::Nonempty; + + { + using tuple_type = cute::tuple>; + [[maybe_unused]] tuple_type t(7); + EXPECT_EQ(cute::get<0>(t), Nonempty(7)); + } + { + using tuple_type = tuple>; + [[maybe_unused]] tuple_type t(7); + EXPECT_EQ(cute::get<0>(t), Nonempty(7)); + } + { + using tuple_type = cute::tuple>; + [[maybe_unused]] tuple_type t(Empty<0>{}); + EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); + } + { + using tuple_type = tuple>; + [[maybe_unused]] tuple_type t(Empty<0>{}); + EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); + } + + { + using tuple_type = cute::tuple>; + [[maybe_unused]] tuple_type t(1.5f, 7); + EXPECT_EQ(cute::get<0>(t), 1.5f); + EXPECT_EQ(cute::get<1>(t), Nonempty(7)); + } + { + using tuple_type = tuple>; + [[maybe_unused]] tuple_type t(1.5f, 7); + EXPECT_EQ(cute::get<0>(t), 1.5f); + EXPECT_EQ(cute::get<1>(t), Nonempty(7)); + } + + { + using tuple_type = cute::tuple, Nonempty>; + [[maybe_unused]] tuple_type t(Empty<0>{}, 7); + EXPECT_EQ(cute::get<0>(t), Empty<0>{}); + EXPECT_EQ(cute::get<1>(t), Nonempty(7)); + } + { + using tuple_type = tuple, Nonempty>; + [[maybe_unused]] tuple_type t(Empty<0>{}, 7); + EXPECT_EQ(cute::get<0>(t), Empty<0>{}); + EXPECT_EQ(cute::get<1>(t), Nonempty(7)); + } + + { + using tuple_type = cute::tuple, Nonempty>; + [[maybe_unused]] tuple_type t(Empty<0>{}, 7); + EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); + EXPECT_EQ(cute::get<1>(t), Nonempty(7)); + } + { + using tuple_type = tuple, Nonempty>; + [[maybe_unused]] tuple_type t(Empty<0>{}, 7); + EXPECT_EQ(cute::get<0>(t), ConvertibleFromEmpty<0>{}); + EXPECT_EQ(cute::get<1>(t), Nonempty(7)); + } + + { + using inner_tuple_type = cute::tuple>; + using outer_tuple_type = cute::tuple; + [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); + } + { + using inner_tuple_type = tuple>; + using outer_tuple_type = tuple; + [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); + } + { + using inner_tuple_type = cute::tuple>; + using outer_tuple_type = cute::tuple; + [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); + } + { + using inner_tuple_type = tuple>; + using outer_tuple_type = tuple; + [[maybe_unused]] outer_tuple_type t(inner_tuple_type{Empty<0>{}}); + } +} + +namespace test { + +template +void test_tuple_find(Tuple const& t) { + auto index = cute::find(t); + 
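+  // A hedged reading of this helper (its exact template arguments are an
+  // assumption): cute::find is presumably called with the searched-for type as
+  // an explicit template argument and yields an integral-constant index, so the
+  // assertion below verifies at compile time that the index equals ExpectedIndex.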
+  static_assert(decltype(index)::value == ExpectedIndex);
+}
+
+template